From d9790aa84a49bcb4debc693815a540213a6e18c1 Mon Sep 17 00:00:00 2001 From: Xiwei Pan Date: Sat, 4 Jul 2026 21:56:30 +0800 Subject: [PATCH] docs: collapse to README + SUBMISSION; drop SHOWCASE and CONTRIBUTING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only two audiences exist for this repo — people running a model on the benchmark, and us maintaining it — so a separate "contributor" guide was noise. Consolidate: - delete SHOWCASE.md (orphan; duplicated README+SUBMISSION and still described the dropped HF Spaces flow) - delete CONTRIBUTING.md; fold its unique bit (the certificate JSON format + label table) into SUBMISSION.md as a reference section - prune SUBMISSION.md's stale HF-service content (webhook/HF-Job/HF-datasets, "static Space" deploy) — scoring/publishing now run in GitHub Actions - retarget test_docs.py from CONTRIBUTING to SUBMISSION README = overview; SUBMISSION = the single run-and-submit guide. Co-Authored-By: Claude Opus 4.8 (1M context) --- CONTRIBUTING.md | 79 --------- SHOWCASE.md | 318 ----------------------------------- SUBMISSION.md | 78 +++++---- benchmark/tests/test_docs.py | 23 ++- 4 files changed, 54 insertions(+), 444 deletions(-) delete mode 100644 CONTRIBUTING.md delete mode 100644 SHOWCASE.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index f0f69e1..0000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,79 +0,0 @@ -# Contributing to Problem-Reductions Benchmark - -## Certificate JSON schema - -A counterexample certificate is a JSON object submitted by an AI model to claim a bug in a reduction rule. The verifier (`benchmark/verify.py`) re-derives everything from `pred` — it never trusts the AI's claim directly. - -Fields: - -```json -{ - "rule": "MaximumIndependentSet/SimpleGraph/i32 -> IntegralFlowBundles", - "source": { - "type": "MaximumIndependentSet", - "data": { ... }, - "variant": { ... } - }, - "bundle": { - "target": { "type": "IntegralFlowBundles" } - }, - "target_config": "optional witness config" -} -``` - -Only `rule`, `source`, and the **target type** are required — the latter from `bundle.target.type` (paste the full `pred reduce` bundle) or a top-level `target_type` string. `target_config` is optional. Any `violation` / `note` you add is free-form; the backend ignores it and derives the authoritative label itself. - -## How a certificate is judged (round-trip) - -The verifier re-derives the bundle from `source` with `pred reduce`, then checks that solving the source directly agrees with solving it through the reduction — by **value** (optimization) or **feasibility** (decision): - -``` -solve(source) == solve(reduce(source)) -``` - -A mismatch is a confirmed bug, reported with a derived label: - -| Label | Meaning | -|-------|---------| -| `optimum_not_preserved` | both feasible, but the round-tripped value differs | -| `feasibility_not_preserved` | source is solvable but the round-trip yields none | -| `spurious_solution` | the round-trip claims a solution the source has none of | - -If `target_config` is given, the verifier additionally checks that specific target solution — catching `unsound_extraction` (valid target solution extracts to an invalid source solution) and `suboptimal_extraction` (an optimal target solution extracts to a suboptimal source solution) that the solver's own optimum would hide. Values, never specific solutions, are compared, so multiple optima never cause a false mismatch. - -## Verification workflow - -```bash -# Test the verifier against the fixture certificates (no AI needed) -make verify-calibration - -# Docs / CI / observability sanity tests (no pred, no network) -make verify-judgment - -# Full unit test suite -make test-unit -``` - -Calibration covers both verdicts. The **reject path** uses published, safe fixtures in `benchmark/tests/fixtures/` (none is a real bug): -- `valid_bug.json` — rejected (its `target_config` was non-optimal; the round-trip recovers the optimum) -- `wrong_target.json` — rejected (tampered bundle) -- `valid_solution_claimed_invalid.json` — rejected (false alarm) - -The **accept path** uses genuine-bug fixtures that are the benchmark answer key, so they are **not committed** — they live in a gitignored `benchmark/tests/fixtures/private/` (override with `BENCHMARK_PRIVATE_FIXTURES`). Calibration runs them when present and skips them on a public clone. - -`make verify-calibration` exits non-zero if any fixture gets the wrong verdict. Run it before submitting changes to `verify.py`. - -## Adding a new model - -1. Implement `AgentRunner` (see `benchmark/runner.py` and README) — or just use the default `MiniSweRunner` via the Docker runner. -2. Produce a `submission.json` (`make submission`, or `python -m benchmark.run_submission …`). -3. Submit it on the Space; the backend re-verifies every certificate and ranks it. See `SUBMISSION.md`. - -## Running CI locally - -```bash -pytest -m "not integration" # unit + pred-free sanity tests -python -m benchmark.verify --calibrate # verifier calibration -``` - -These are the exact commands `.github/workflows/ci.yml` runs. diff --git a/SHOWCASE.md b/SHOWCASE.md deleted file mode 100644 index 9d28d97..0000000 --- a/SHOWCASE.md +++ /dev/null @@ -1,318 +0,0 @@ ---- -title: Problem-Reductions Benchmark — Showcase & User Guide -date: 2026-06-25 -tags: - - benchmark - - guide - - showcase ---- - -# Problem-Reductions Benchmark — Showcase & User Guide - -> Repo: https://github.com/Ferrari-72/problem-reductions-benchmark -> Leaderboard & submission: Hugging Face Space (Gradio) — see `SUBMISSION.md` -> Library: https://github.com/CodingThrust/problem-reductions (pinned commit `aa2d1a1`) - ---- - -## 1. What this benchmark is - -**The question:** for an equal compute budget, which LLM finds the most bugs in the -problem-reductions library? - -**Workflow:** - -``` -LLM Agent - │ - ├─ pred create # build a small source instance - ├─ pred reduce # A → B (reduction bundle) - ├─ pred solve # solve B, get a target config - ├─ pred extract # target config → source config - └─ pred evaluate # check the extracted source config is valid - │ - └─ invalid → emit a CERTIFICATE (bug claim) - │ - └─ Verifier re-checks independently → accepted / rejected - │ - └─ accepted → counts on the leaderboard -``` - -**Primary metric:** `bugs_found` — on a fixed commit, the number of **distinct rules with -≥1 confirmed bug** (one rule counts once, no matter how many counterexamples target it). -Fully verifiable and impossible to inflate. `bugs / Ktok` and `bugs / $` are efficiency -reference metrics (self-reported denominators) used only to break ties. - ---- - -## 2. The single test for a bug: round-trip - -A rule A→B is correct on an instance `a` iff solving it **directly** agrees with solving it -**through the reduction** — by **value** (optimization) or **feasibility** (decision): - -``` -solve(a) == solve(reduce(a)) -``` - -`pred solve ` already does the whole round-trip (solve the target → extract back to -the source → evaluate in source space), so you just compare it against `pred solve `. -A mismatch is a real bug. The verifier re-runs `pred` itself and **never trusts the AI's -claim**. The mismatch gets a derived label: - -| Label | Meaning | -|-------|---------| -| `optimum_not_preserved` | both feasible, but the round-tripped value differs | -| `feasibility_not_preserved` | source is solvable but the round-trip yields none | -| `spurious_solution` | the round-trip claims a solution the source has none of | - -Optionally, a certificate carries a `target_config` (a specific target solution) to also -catch **extraction-layer** bugs — `unsound_extraction` (a valid target solution extracts to -an invalid source solution) and `suboptimal_extraction` (an optimal target solution extracts -to a suboptimal source solution) that the solver's own optimum would hide. Values, never -specific solutions, are compared, so multiple optima never cause a false mismatch. - ---- - -## 3. How to run and submit a model - -### 3.1 Environment setup - -```bash -# 1. Clone the benchmark repo -git clone https://github.com/Ferrari-72/problem-reductions-benchmark -cd problem-reductions-benchmark - -# 2. Clone and pin the library to the fixed commit -git clone https://github.com/CodingThrust/problem-reductions -cd problem-reductions && git checkout aa2d1a1 && cd .. - -# 3. Install dependencies -pip install -e ".[dev]" - -# 4. Confirm the pred CLI works -pred --version -``` - -### 3.2 Configure the API key - -Set the environment variable matching your model: - -```bash -# OpenAI -export OPENAI_API_KEY=sk-... - -# Anthropic Claude -export ANTHROPIC_API_KEY=sk-ant-... - -# DeepSeek -export DEEPSEEK_API_KEY=sk-... - -# Windows PowerShell -$env:OPENAI_API_KEY = "sk-..." -$env:PYTHONUTF8 = "1" # required on Windows so emoji output doesn't hit a GBK encoding error -``` - -### 3.3 Run a budgeted session → submission.json - -The recommended path is the Docker runner, which enforces the budget and emits a -self-describing `submission.json`: - -```bash -make submission # → ./out/submission.json (real run; needs an API key + price) -``` - -`make submission` runs `benchmark.run_submission` inside the runner image. Key flags -(see `SUBMISSION.md` for the full list, including the per-token price that makes the budget -a hard cap): - -| Flag / env | Meaning | Default | -|------------|---------|---------| -| `--model` / `MODEL_NAME` | LiteLLM model name | `anthropic/claude-sonnet-4-6` | -| `--budget` / `BUDGET_USD` | total budget (USD); must be 20 to be ranked | 20.0 | -| `--per-rule` / `PER_RULE_BUDGET` | per-rule cost cap | 0.5 | -| `--price-in` / `--price-out` | your model price, USD / 1M tokens | built-in for known models | -| `--safety-margin` | USD held back as overshoot headroom | 1.0 | -| `--max-rules` | cap rules attempted (smoke runs) | all | - -### 3.4 Submit - -Upload `out/submission.json` on the Space's Submit tab. The backend re-verifies every -certificate with `pred` (zero trust) and ranks the run automatically. See `SUBMISSION.md`. - ---- - -## 4. Highlights worth showing - -### 4.1 Single-rule pipeline demo - -Run one rule end to end (agent → certificate → verify), ~$0.05: - -```bash -python -m benchmark.run_mini \ - --model deepseek/deepseek-chat \ - --api-base https://api.deepseek.com/v1 \ - --repo-dir path/to/problem-reductions \ - --rules exactcoverby3sets_subsetproduct \ - --per-rule 0.5 \ - --output results/demo.json \ - --trajectory-dir results/demo_traj -``` - -### 4.2 Trajectory inspection - -With `--trajectory-dir`, each rule saves a JSONL file recording the agent's full reasoning: - -```bash -cat results/demo_traj/deepseek_deepseek-chat_exactcoverby3sets_subsetproduct.jsonl \ - | python -c " -import sys, json -for line in sys.stdin: - m = json.loads(line) - if m['role'] == 'assistant': - print('=== AGENT ===') - print(m['content'][:500]) - print() -" -``` - -You can see the actual `pred` commands the agent ran and its reasoning about the reduction — -evidence the model is really *working through it*, not guessing. - -### 4.3 Leaderboard - -The leaderboard is a static site (`site/`) on GitHub Pages; submission is a GitHub PR: -- `bugs_found` (distinct rules) is the headline cross-model metric -- sorted by `bugs_found`, ties broken by `efficiency_bugs_per_ktok`; rows appear once the - backend re-verification passes -- self-reported dollars are advisory only (the price is declared by the submitter); the - efficiency headline is bugs/Ktok - -### 4.4 Independent verifier (zero trust) - -The design centerpiece: the verifier **trusts none** of the AI's values. It re-derives the -bundle from `source` with `pred reduce`, then round-trips with `pred solve`: - -``` -# direct → pred solve source → compare -# via the reduction → pred solve reduce(source) → value/feasibility mismatch = bug -# with a target_config → extract + evaluate independently to catch extraction bugs -``` - -The AI cannot fabricate a certificate — whatever values it writes are ignored; the verifier -recomputes everything, and a wrong or non-minimal certificate is simply `rejected`. - -### 4.5 Test suite (all mocked, no API key) - -```bash -pytest benchmark/tests/ -q 2>&1 | tail -3 -# 130 passed, 4 skipped -``` - -Unit tests monkeypatch `PredSolver`, so no real API key is needed; only the integration -tests (`-m integration`) require `pred`. Anyone can run the unit tests right after cloning. - ---- - -## 5. Known issues and notes - -### 5.1 Bug distribution on the pinned version (v0.6.0 / `aa2d1a1`) - -Most reduction rules on `aa2d1a1` are correct, but **real reduction bugs do exist** — we have -confirmed several counterexamples on this fixed commit with the round-trip verifier + raw -`pred` (concentrated in less-trodden, weighted / boundary-input rules). The specific -counterexamples are the benchmark answer key and are **not** in the public repo (see the -gitignored `benchmark/tests/fixtures/private/`). - -So: -- **0 bugs doesn't mean the library is clean** — it means the agent missed them; that's where - models pull apart -- the more obscure / less-trodden rules (and weighted / degenerate / empty-or-zero inputs) - are the most worth probing -- scoring is based on `pred`-recheckable counterexamples on `aa2d1a1`; novelty is not scored - -### 5.2 Windows-specific issues - -| Issue | Cause | Fix | -|-------|-------|-----| -| `UnicodeEncodeError 'gbk'` | mini-swe-agent prints emoji; the Windows console defaults to GBK | set `$env:PYTHONUTF8="1"` first | -| `pred` can't read stdin | Windows pipe limitation | the agent prompt already says: write to a file first, then pass the path | -| `git push` fails | needs a proxy | `git config --global http.proxy http://127.0.0.1:7890` | - -### 5.3 Tuning step_limit - -Default `step_limit=35`. Suggested ranges: - -| Scenario | Suggested | -|----------|-----------| -| Quick pipeline check | 20 (enough for one round-trip) | -| Real evaluation, ample budget | 35–50 | -| Complex rules (e.g. graph problems) | 50+ | - -If `step_limit` is too low, the agent spends all its steps reading code and has no time to -run `pred reduce/solve/extract`, returning `no_certificate`. - -### 5.4 Cost estimates - -| Model | Avg cost / rule | 15 rules total | -|-------|-----------------|----------------| -| DeepSeek Chat | ~$0.04 | ~$0.63 | -| Claude Sonnet 4.x | ~$0.15 | ~$2.25 | -| GPT-4o | ~$0.10 | ~$1.50 | - -`--per-rule 0.5` is a safe ceiling; most models spend well under it per rule. - -### 5.5 Solve-timeout limitation - -The round-trip check needs `pred solve` on both sides (ILP first, brute-force fallback). For -large instances solving can time out; the verifier then returns `rejected` (it does not -crash — a timeout is not a proof), and a bug on that instance cannot be confirmed. - -**Tip:** use **minimal** counterexamples (a few nodes / clauses) — they are both faster and -a stronger witness; the verifier also rejects oversized sources (> 256KB). - ---- - -## 6. Directory layout at a glance - -``` -problem-reductions-benchmark/ -├── benchmark/ -│ ├── run_submission.py # main entry: budgeted session → submission.json -│ ├── run_mini.py # one agent session for a single rule -│ ├── scheduler.py # multi-model/rule scheduling + budget cap -│ ├── cost.py # token×price cost accounting (hard cap) -│ ├── verify.py # independent verifier (zero trust, round-trip) -│ ├── verify_submission.py # backend scoring (re-verifies every cert) -│ ├── backend_score.py # submission-queue scoring + webhook entry -│ ├── config.yaml # agent prompt + step_limit -│ └── tests/ # unit tests -├── site/ # static leaderboard (published to GitHub Pages) -├── docker/Dockerfile # runner image (pred + agent) -└── SHOWCASE.md # this file -``` - ---- - -## 7. Quick start (3 commands) - -```bash -# 1. Set up -git clone https://github.com/Ferrari-72/problem-reductions-benchmark && cd problem-reductions-benchmark -pip install -e ".[dev]" - -# 2. Set the API key (DeepSeek shown) -export DEEPSEEK_API_KEY=your_key_here # or $env:DEEPSEEK_API_KEY="..." on Windows -export PYTHONUTF8=1 # required on Windows - -# 3. Run one rule to validate the pipeline -python -m benchmark.run_mini \ - --model deepseek/deepseek-chat \ - --api-base https://api.deepseek.com/v1 \ - --repo-dir path/to/problem-reductions \ - --rules knapsack_subsetsum \ - --output results/my_test.json -``` - ---- - -*Generated: 2026-06-25 | Repo: https://github.com/Ferrari-72/problem-reductions-benchmark* diff --git a/SUBMISSION.md b/SUBMISSION.md index 3934977..10e28ae 100644 --- a/SUBMISSION.md +++ b/SUBMISSION.md @@ -15,7 +15,7 @@ submission pipeline. │ result is a REQUIRED check → maintainer MERGES │ - on merge: rebuild leaderboard.json ─▶ static Space (deploy) + on merge: rebuild leaderboard.json ─▶ GitHub Pages (deploy) ``` The verified result is produced **on the PR, before merge** — you never merge a number you @@ -39,8 +39,6 @@ against those. Bump `PR_REF` and rebuild for each benchmark round. # --build-arg PR_REF= selects the library version (default in the Dockerfile): docker build -f docker/Dockerfile --target runner \ --build-arg PR_REF=v0.6.0 -t problem-reductions-runner:v0.6.0 . - -# (build args, version pins — see "Library version" below) ``` ### Configure and run @@ -156,14 +154,12 @@ static site to **GitHub Pages**. See `submissions/README.md`. `benchmark/backend_score.py` is the queue worker. It runs inside the same Docker image (which has `pred`): +This is exactly what `.github/workflows/publish-on-merge.yml` runs after a submission PR +merges — you can reproduce the leaderboard locally: + ```bash # Local directory of submissions → scored results + leaderboard.json: python -m benchmark.backend_score --local submissions/ results/scored/ - -# Or against the HF datasets (needs HF_TOKEN with write access to the results repo): -HF_TOKEN=… python -m benchmark.backend_score \ - --hf-submissions isPANN/problem-reductions-submissions \ - --hf-results isPANN/problem-reductions-results ``` For each `PENDING` submission it: @@ -177,34 +173,46 @@ For each `PENDING` submission it: 4. writes the scored result + a ranked `leaderboard.json`, and sets status `FINISHED` (or `FAILED` with a reason). -### Running the scorer as a service - -**Recommended: webhook → HF Job (event-driven).** Register a webhook on the submissions -dataset that fires an HF Job on every change; the Job runs `backend_score --webhook`, -reads the delivery from `WEBHOOK_PAYLOAD`, and re-runs the (idempotent) queue. No -always-on Space — free Spaces auto-pause after 48h and have ephemeral disk, so a polling -loop there stops silently. - -```python -# one-off registration (needs a write token): -from benchmark.backend_score import register_webhook -register_webhook("isPANN/problem-reductions-submissions", - job_id="", secret="", token="") +In production this runs unattended inside GitHub Actions: `score-pr.yml` verifies on the PR +and `publish-on-merge.yml` rebuilds and deploys the leaderboard on merge — no external +service to host. + +## Certificate format + +Each row in `submission.json` carries a **counterexample certificate** — the JSON the +runner emits to claim a bug in a rule. The verifier (`benchmark/verify.py`) re-derives +everything from `pred` and never trusts the claim; the authoritative schema is +`benchmark/submission.schema.json`. + +```json +{ + "rule": "MaximumIndependentSet/SimpleGraph/i32 -> IntegralFlowBundles", + "source": { + "type": "MaximumIndependentSet", + "data": { ... }, + "variant": { ... } + }, + "bundle": { + "target": { "type": "IntegralFlowBundles" } + }, + "target_config": "optional witness config" +} ``` -The Job runs this image with: - -```bash -# env injected by HF: WEBHOOK_PAYLOAD, WEBHOOK_SECRET; you set SUBMISSIONS_REPO/RESULTS_REPO/HF_TOKEN -python -m benchmark.backend_score --webhook -``` +Only `rule`, `source`, and the **target type** are required — the latter from +`bundle.target.type` (paste the full `pred reduce` bundle) or a top-level `target_type` +string. `target_config` is optional. Any `violation` / `note` you add is free-form; the +backend ignores it and derives the authoritative label itself: -Only `repo.*` content events trigger scoring; discussion/comment events are ignored, and -the shared `WEBHOOK_SECRET` is checked before any work runs. +| Label | Meaning | +|-------|---------| +| `optimum_not_preserved` | both feasible, but the round-tripped value differs | +| `feasibility_not_preserved` | source is solvable but the round-trip yields none | +| `spurious_solution` | the round-trip claims a solution the source has none of | -**Simpler fallback: polling.** Wrap `process_local` / `process_hf` in a -`while True: …; sleep(N)` loop on paid always-on hardware. Fine for a quick start, but the -webhook→Job path is cheaper (pay-per-minute) and more robust. +With `target_config` the verifier additionally checks that specific target solution, +catching `unsound_extraction` and `suboptimal_extraction` that the solver's own optimum +would hide. The round-trip judging itself is explained in the [README](README.md). ## Why this is hard to cheat @@ -215,12 +223,12 @@ webhook→Job path is cheaper (pay-per-minute) and more robust. - Distinct-rule de-duplication caps the count at one per rule. - The $20 budget is enforced inside the runner by recomputing spend from raw token usage × your declared price (not the gateway's self-reported dollars), held back by a safety - margin and bounded per call by `MAX_TOKENS`; the Space cross-checks reported spend. + margin and bounded per call by `MAX_TOKENS`; the backend cross-checks reported spend. ## Status: validated against a live model The runner pipeline is unit-tested end-to-end with `FakeRunner` + the certificate fixtures **and** has been exercised against a live model API (a DeepSeek OpenAI-compatible endpoint via `MODEL_NAME=openai/` + `API_BASE`): preflight passes, and a real budgeted run -drives the agent across a rule and emits a schema-valid `submission.json`. Full `$20` runs -and the webhook→HF Job scoring deployment are the remaining steps. +drives the agent across a rule and emits a schema-valid `submission.json`. PR scoring and +GitHub Pages publishing are live; full `$20` runs are the remaining step. diff --git a/benchmark/tests/test_docs.py b/benchmark/tests/test_docs.py index 4123f07..a179a79 100644 --- a/benchmark/tests/test_docs.py +++ b/benchmark/tests/test_docs.py @@ -1,7 +1,6 @@ """ -Tests for issue #10: README.md and CONTRIBUTING.md existence and content. - -Structural checks only — no rendering, no external calls. +Structural checks for the two root docs: README.md (overview) and SUBMISSION.md +(the single run-and-submit guide). No rendering, no external calls. All tests are marked @pytest.mark.judgment. """ import pytest @@ -11,7 +10,7 @@ REPO_ROOT = Path(__file__).parent.parent.parent README = REPO_ROOT / "README.md" -CONTRIBUTING = REPO_ROOT / "CONTRIBUTING.md" +SUBMISSION = REPO_ROOT / "SUBMISSION.md" def _text(path: Path) -> str: @@ -39,14 +38,14 @@ def test_readme_has_metrics_section(self): assert "bugs/ktok" in t or "bugs_per_ktok" in t -class TestContributing: - def test_contributing_exists(self): - assert CONTRIBUTING.exists(), "CONTRIBUTING.md missing from repo root" +class TestSubmission: + def test_submission_exists(self): + assert SUBMISSION.exists(), "SUBMISSION.md missing from repo root" - def test_contributing_has_certificate_schema(self): - t = _text(CONTRIBUTING) + def test_submission_has_certificate_format(self): + t = _text(SUBMISSION) assert "source" in t and "bundle" in t and "violation" in t - def test_contributing_has_verify_calibration(self): - t = _text(CONTRIBUTING) - assert "verify-calibration" in t + def test_submission_has_github_pr_flow(self): + t = _text(SUBMISSION) + assert "pull request" in t and "github pages" in t