diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt
index 397b0f0..a2eef85 100644
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@@ -1,4 +1,3 @@
-anthropic==0.52.0
 mini-swe-agent==2.2.8
 pyyaml>=6.0
 pytest>=8.0
diff --git a/benchmark/run.py b/benchmark/run.py
deleted file mode 100644
index 116780e..0000000
--- a/benchmark/run.py
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python3
-"""
-Minimal SWE-like benchmark for problem-reductions bug finding.
-Pattern: rule file → AI → inject test → cargo test → record result.
-"""
-
-import argparse
-import json
-import os
-import re
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-
-import anthropic
-
-REPO_URL = "https://github.com/CodingThrust/problem-reductions"
-
-PRICING = {
-    "claude-sonnet-4-6":        {"input": 3.0,  "output": 15.0},
-    "claude-haiku-4-5-20251001": {"input": 0.8,  "output": 4.0},
-    "claude-opus-4-7":           {"input": 15.0, "output": 75.0},
-}
-
-SYSTEM = """\
-You are a Rust expert finding bugs in problem reduction rules.
-
-Each rule implements two functions:
-- reduce_to(): transforms source problem A into target problem B
-- extract_solution(): maps a target solution back to source space
-
-A rule is BUGGY if the round-trip fails: given A, reduce to B, solve B to get
-solution s, extract s → A', but A' is not a valid solution to A.
-
-Write a single #[test] function that:
-1. Constructs a specific source problem instance
-2. Calls reduce_to() to get the target problem
-3. Constructs or derives a target solution (use a brute-force solver if available)
-4. Calls extract_solution() with that solution
-5. Asserts the extracted result is a valid solution to the source problem
-
-If the rule has a bug, your test FAILS. If correct, it PASSES.
-Output ONLY a Rust code block with the test function. No explanation.\
-"""
-
-
-def cost_usd(usage, model: str) -> float:
-    p = PRICING.get(model, {"input": 3.0, "output": 15.0})
-    return (usage.input_tokens * p["input"] + usage.output_tokens * p["output"]) / 1_000_000
-
-
-def extract_rust_block(text: str) -> str | None:
-    m = re.search(r"```rust\n(.*?)```", text, re.DOTALL)
-    return m.group(1).strip() if m else None
-
-
-def find_test_file(repo_dir: str, rule_name: str) -> Path | None:
-    """Find the unit test file for a rule by scanning the rule file for #[path = ...]."""
-    rule_path = Path(repo_dir) / "src" / "rules" / f"{rule_name}.rs"
-    if not rule_path.exists():
-        return None
-    content = rule_path.read_text(encoding="utf-8")
-    m = re.search(r'#\[path\s*=\s*"([^"]+)"\]', content)
-    if m:
-        rel = m.group(1)
-        return (Path(repo_dir) / "src" / "rules" / rel).resolve()
-    # Fallback: conventional location
-    p = Path(repo_dir) / "src" / "unit_tests" / "rules" / f"{rule_name}.rs"
-    return p if p.exists() else None
-
-
-def run_cargo_test(repo_dir: str, test_name: str) -> tuple[bool, str]:
-    """Returns (bug_found, output). bug_found=True means test failed (bug detected)."""
-    result = subprocess.run(
-        ["cargo", "test", test_name, "--", "--nocapture"],
-        cwd=repo_dir, capture_output=True, text=True, timeout=180
-    )
-    output = result.stdout + result.stderr
-    # Distinguish compile error from test failure
-    if "error[" in output or "error: " in output:
-        return False, "compile_error: " + output[:500]
-    return result.returncode != 0, output[:500]
-
-
-def benchmark_rule(client, model: str, repo_dir: str, rule_name: str) -> dict:
-    rule_path = Path(repo_dir) / "src" / "rules" / f"{rule_name}.rs"
-    rule_content = rule_path.read_text(encoding="utf-8")
-
-    test_file = find_test_file(repo_dir, rule_name)
-    test_content = test_file.read_text(encoding="utf-8") if test_file else "// no existing tests"
-
-    safe_name = rule_name.replace("-", "_")
-    user_msg = (
-        f"Rule file `{rule_name}.rs`:\n```rust\n{rule_content[:5000]}\n```\n\n"
-        f"Existing tests (for import patterns):\n```rust\n{test_content[:2000]}\n```\n\n"
-        f"Write test `test_bug_{safe_name}` that probes for bugs in reduce_to() or extract_solution()."
-    )
-
-    resp = client.messages.create(
-        model=model, max_tokens=1024,
-        system=SYSTEM,
-        messages=[{"role": "user", "content": user_msg}]
-    )
-
-    cost = cost_usd(resp.usage, model)
-    test_code = extract_rust_block(resp.content[0].text)
-
-    if not test_code:
-        return {"rule": rule_name, "result": "parse_error", "cost": cost}
-
-    if not test_file:
-        return {"rule": rule_name, "result": "no_test_file", "cost": cost}
-
-    original = test_file.read_text(encoding="utf-8")
-    test_file.write_text(original + f"\n\n{test_code}\n", encoding="utf-8")
-
-    test_name = f"test_bug_{safe_name}"
-    try:
-        bug_found, output = run_cargo_test(repo_dir, test_name)
-    except subprocess.TimeoutExpired:
-        test_file.write_text(original, encoding="utf-8")
-        return {"rule": rule_name, "result": "timeout", "cost": cost}
-    finally:
-        test_file.write_text(original, encoding="utf-8")  # always restore
-
-    return {
-        "rule": rule_name,
-        "result": "bug_found" if bug_found else "no_bug",
-        "cost": cost,
-        "cargo_output": output,
-        "test_code": test_code,
-    }
-
-
-# Rules to skip (not reduction rules)
-SKIP_RULES = {"mod", "traits", "graph_helpers", "analysis", "cost", "registry", "graph"}
-
-
-def list_rules(repo_dir: str) -> list[str]:
-    rules_dir = Path(repo_dir) / "src" / "rules"
-    return [
-        f.stem for f in sorted(rules_dir.glob("*.rs"))
-        if f.stem not in SKIP_RULES
-    ]
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Problem-reductions bug-finding benchmark")
-    parser.add_argument("--model", default="claude-sonnet-4-6")
-    parser.add_argument("--budget", type=float, default=20.0, help="USD budget")
-    parser.add_argument("--rules", nargs="*", help="specific rule names (default: all)")
-    parser.add_argument("--output", default="results/results.json")
-    parser.add_argument("--repo-dir", help="use existing local clone instead of cloning")
-    args = parser.parse_args()
-
-    client = anthropic.Anthropic()
-
-    def run(repo_dir: str):
-        rules = args.rules if args.rules else list_rules(repo_dir)
-        results, total_cost, bugs_found = [], 0.0, 0
-
-        for rule_name in rules:
-            if total_cost >= args.budget:
-                print("Budget exhausted.")
-                break
-            print(f"  {rule_name}...", end=" ", flush=True)
-            r = benchmark_rule(client, args.model, repo_dir, rule_name)
-            results.append(r)
-            total_cost += r["cost"]
-            status = "BUG FOUND" if r["result"] == "bug_found" else r["result"]
-            print(f"{status} (${r['cost']:.4f})")
-            if r["result"] == "bug_found":
-                bugs_found += 1
-
-        summary = {
-            "model": args.model,
-            "bugs_found": bugs_found,
-            "total_cost_usd": round(total_cost, 6),
-            "efficiency_bugs_per_dollar": round(bugs_found / total_cost, 4) if total_cost else 0,
-            "rules_tested": len(results),
-            "results": results,
-        }
-        out = Path(args.output)
-        out.parent.mkdir(parents=True, exist_ok=True)
-        out.write_text(json.dumps(summary, indent=2), encoding="utf-8")
-        print(f"\n{bugs_found} bugs found | ${total_cost:.4f} spent | "
-              f"{summary['efficiency_bugs_per_dollar']:.2f} bugs/$")
-        print(f"Results → {args.output}")
-
-    if args.repo_dir:
-        run(args.repo_dir)
-    else:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            print(f"Cloning {REPO_URL}...")
-            subprocess.run(["git", "clone", "--depth=1", REPO_URL, tmpdir], check=True)
-            run(tmpdir)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/site/bug-verification.png b/site/bug-verification.png
new file mode 100644
index 0000000..19ec758
Binary files /dev/null and b/site/bug-verification.png differ