problem-reductions-benchmark/Makefile at main · CodingThrust/problem-reductions-benchmark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Makefile for problem-reductions-benchmark
# Run from the repo root (next to benchmark/).
#
# Key targets:
#   test                 Run full pytest suite (unit + integration)
#   test-unit            Run only unit tests (no real repo/pred needed)
#   verify-calibration   Test the verifier against known fixtures (no AI needed)
#   preflight            Validate submission.env with one tiny real call before a full run
#   run                  Run the benchmark via Docker → out/submission.json (does NOT upload)
#
# Model + key + price for the real run live in submission.env (any provider — see
# submission.env.example); the preflight and run targets pass it to Docker via --env-file.
# REPO_DIR is only for the local-clone targets (audit).

# Every setting below uses ?=, so a value supplied on the command line or in the
# environment wins over the default given here.

# Path handed to benchmark.pred_audit by the audit target.
REPO_DIR ?= ../problem-reductions

# PR_REF is the problem-reductions version (tag or commit) this benchmark round targets.
# IMAGE embeds PR_REF, so one override moves both the build arg and the image tag:
#   make runner-build PR_REF=v0.7.0   →   builds + tags problem-reductions-runner:v0.7.0
PR_REF ?= v0.6.0
IMAGE ?= problem-reductions-runner:$(PR_REF)

# The two positional arguments of the score-local target (input first, then output).
SUBS_DIR ?= submissions
SCORED ?= results/scored

# File given to `docker run --env-file` by the preflight and run targets.
ENV_FILE ?= submission.env

# All targets in this file are command names, not files; .PHONY declarations accumulate.
.PHONY: test test-unit verify-judgment verify-calibration
.PHONY: runner-build preflight run
.PHONY: score-local serve audit install-deps help

## Whole pytest suite, verbose: unit tests plus the integration tests that need the real repo.
test:
	pytest --verbose

## Unit tests only: deselects everything marked "integration", so neither the real
## repo nor a pred binary is required.
test-unit:
	pytest --verbose -m 'not integration'

## Runs only the tests marked "judgment": pred-free sanity checks (docs, CI workflow, trajectory).
verify-judgment:
	pytest --verbose -m judgment

## Test the verifier against the fixture certificates — no AI, no API keys needed.
## Must pass before any real session is run.
## Invokes the benchmark.verify module with its --calibrate flag.
verify-calibration:
	python -m benchmark.verify --calibrate

## Builds the "runner" stage of docker/Dockerfile and tags the result as IMAGE.
## PR_REF is forwarded as a build arg (the image compiles pred at that ref and bundles the agent).
## The build context is the current directory.
runner-build:
	docker build \
	  --file docker/Dockerfile \
	  --target runner \
	  --build-arg PR_REF=$(PR_REF) \
	  --tag $(IMAGE) \
	  .

## Cheap sanity check of submission.env before paying for a full $20 run: one tiny real
## API call plus pred/rules checks, costing a fraction of a cent. (The runner's no-API
## wiring is exercised by the pytest suite instead; there is no make target for it.)
## Stops with exit status 1 and a hint when ENV_FILE does not exist.
preflight:
	@test -f "$(ENV_FILE)" || { \
	  echo "No $(ENV_FILE) — copy submission.env.example and fill it in first"; \
	  exit 1; }
	docker run --rm --env-file "$(ENV_FILE)" $(IMAGE) --preflight

## Run the budgeted bug-finding agent via Docker → writes ./out/submission.json.
## This RUNS the benchmark locally; it does NOT submit — submitting is a separate step
## (open a GitHub PR adding the file, see CONTRIBUTING.md). Config lives in submission.env
## (copy submission.env.example); run `make preflight` first to validate it.
##
## Stops with exit status 1 and a hint when ENV_FILE does not exist.
## The bind mount is built from $(CURDIR), which make itself sets to its working
## directory, so it always names the same ./out that `mkdir -p out` just created.
## $(PWD) is only an inherited environment variable: it is stale under `make -C <dir>`
## and empty when the caller's environment lacks it.
run:
	@if [ ! -f "$(ENV_FILE)" ]; then \
	  echo "No $(ENV_FILE) — copy submission.env.example and fill it in (then: make preflight)"; exit 1; fi
	mkdir -p out
	docker run --rm --env-file "$(ENV_FILE)" -v "$(CURDIR)/out:/out" $(IMAGE)
	@echo "Wrote ./out/submission.json — now submit it via a GitHub PR (see CONTRIBUTING.md)."

## Score all submissions in SUBS_DIR with the zero-trust backend (needs pred).
## Writes scored results + leaderboard.json into SCORED.
## Both directories are passed positionally after --local: SUBS_DIR first, then SCORED.
score-local:
	python -m benchmark.backend_score --local $(SUBS_DIR) $(SCORED)

## Preview the leaderboard site locally (it's published to GitHub Pages on merge).
## Serves the site/ directory with Python's built-in http.server until interrupted.
## SERVE_PORT selects the port (default 8000), e.g.: make serve SERVE_PORT=9000
serve: SERVE_PORT ?= 8000
serve:
	@echo "Serving site/ at http://localhost:$(SERVE_PORT)  (Ctrl-C to stop)"
	cd site && python3 -m http.server $(SERVE_PORT)

## Audit pred CLI capabilities against the pinned library commit.
## Passes REPO_DIR (default ../problem-reductions) to benchmark.pred_audit as its only argument.
audit:
	python -m benchmark.pred_audit $(REPO_DIR)

## Installs, via pip, the Python packages listed in benchmark/requirements.txt.
install-deps:
	pip install --requirement benchmark/requirements.txt

## Print a one-line summary of every target, followed by the current value of each
## tunable variable (defaults, or whatever was overridden for this invocation).
help:
	@echo "Targets:"
	@echo "  test                Run full pytest suite"
	@echo "  test-unit           Run unit tests only (no real repo needed)"
	@echo "  verify-judgment     Run pred-free sanity tests (docs, CI workflow, trajectory)"
	@echo "  verify-calibration  Test verifier against fixtures (no AI needed)"
	@echo "  runner-build        Build the dockerized submission runner image"
	@echo "  preflight           Validate submission.env (1 tiny real call) before a full run"
	@echo "  run                 Run the benchmark via Docker → out/submission.json (not upload)"
	@echo "  score-local         Score SUBS_DIR submissions with the backend"
	@echo "  serve               Preview the leaderboard site locally (published via Pages on merge)"
	@echo "  audit               Audit pred CLI capabilities"
	@echo "  install-deps        Install Python requirements"
	@echo "  help                Show this message"
	@echo ""
	@echo "Variables:"
	@echo "  REPO_DIR=$(REPO_DIR)"
	@echo "  PR_REF=$(PR_REF)  (problem-reductions tag or commit to build against)"
	@echo "  IMAGE=$(IMAGE)"
	@echo "  SUBS_DIR=$(SUBS_DIR)"
	@echo "  SCORED=$(SCORED)"
	@echo "  ENV_FILE=$(ENV_FILE)  (model/key/price for preflight + run)"