Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 52 additions & 7 deletions config/lanes.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
lanes:
- id: claude-native
kind: cli
model: claude-opus-4-7
model: claude-opus@latest # self-updating: tracks the newest PRICED claude-opus (e.g. claude-opus-4-8) — never a hard-pinned version
trust_mode: full
costBasis: subscription
provenance: anthropic
Expand Down Expand Up @@ -106,14 +106,14 @@ lanes:
# run with TOKENMAXED_DISABLE=1 so they never re-enter routing / recurse.)
- id: claude-haiku
kind: cli
model: claude-haiku-4-5-20251001
model: claude-haiku@latest # self-updating; the {model} arg below spawns the resolved id
trust_mode: full
costBasis: subscription
provenance: anthropic
jurisdiction: US
execution_mode: answer-only
command: claude
args: ["-p", "--model", "claude-haiku-4-5-20251001"]
args: ["-p", "--model", "{model}"] # {model} ⇒ the resolved newest priced claude-haiku
# Secondary / in-family manager. Codex above is the default host-turn reviewer
# (first eligible in file order). On the escalation path this lane can only
# independently review an offloaded output in categories where it isn't weaker
Expand All @@ -132,11 +132,37 @@ lanes:
explain: 0.82
codegen: 0.72

# Full-access Sonnet (Claude Code): a stronger in-family offload than Haiku for
# bounded subtasks, still on the same subscription (no metered $). Self-updating via
# claude-sonnet@latest; the {model} arg spawns the resolved id. As a manager it is a
# capable INDEPENDENT reviewer for Codex's strong categories (which Haiku can't cover).
- id: claude-sonnet
kind: cli
model: claude-sonnet@latest
trust_mode: full
costBasis: subscription
provenance: anthropic
jurisdiction: US
execution_mode: answer-only
command: claude
args: ["-p", "--model", "{model}"] # {model} ⇒ the resolved newest priced claude-sonnet
roles: [manager]
manager_allowed: true
capability:
feature: 0.90
refactor: 0.86
bugfix: 0.85
boilerplate: 0.88
explain: 0.88
codegen: 0.85
docs: 0.86

# --- Other provider lanes (SUPPORTED) ---------------------------------------
# Popular vendors, shipped as SAFE inert TEMPLATES: each is `blocked` (never
# selected) until YOU pick a trust_mode. (The only enabled defaults ABOVE are the
# host `claude-native`, `codex-cli` (the default reviewer), and the in-family
# `claude-haiku` — all first-party or availability-gated. `ollama-llama3` ships
# selected) until YOU pick a trust_mode. (The enabled defaults ABOVE are the host
# `claude-native`, `codex-cli` (the default reviewer), and the in-family Claude Code
# lanes `claude-haiku` + `claude-sonnet` — all first-party or availability-gated.
# `ollama-llama3` ships
# `blocked` too: a local server isn't assumed. In your own config, set every
# lane's trust deliberately.) The vendor lanes here add nothing to your trust
# surface until you change `blocked`. Trust ladder: blocked < worker < reader < full.
Expand All @@ -154,6 +180,10 @@ lanes:
# BYOK api lanes: put the key in env TOKENMAXED_KEY_<authHandle> (never in this file).
# api lanes speak the OpenAI /chat/completions schema — point `endpoint` at your
# provider's OpenAI-COMPATIBLE chat-completions URL (not a vendor-proprietary one).
# COST: `costBasis` is YOUR billing model — it is NOT implied by `kind: api`. Many vendors
# (e.g. MiniMax) bill their API token as a flat-rate SUBSCRIPTION ⇒ `costBasis: subscription`
# (treated as $0 and preferred by routing, like a CLI subscription); pay-per-token ⇒ `metered`.
# /tokenmaxed:setup ASKS you per api lane — TokenMaxed never assumes metered.
# CLI lanes (gemini-cli, kimi-cli) can only be `full` or `blocked` — worker/reader
# executors are API-only, so a CLI lane set to worker/reader loads but never routes.

Expand Down Expand Up @@ -199,9 +229,24 @@ lanes:
# automatically (family from the price table); for an
# unpriced pin add `model_family: minimax`.
trust_mode: blocked # → worker (no repo) or reader (repo-read; see above) or full
costBasis: metered
costBasis: subscription # MiniMax is typically a flat-rate SUBSCRIPTION token (not pay-per-token); /tokenmaxed:setup confirms this with you
provenance: minimax
jurisdiction: CN
endpoint: https://api.minimax.io/v1/chat/completions # use the OpenAI-compatible URL
authHandle: MINIMAX
capability: { codegen: 0.80, boilerplate: 0.80, docs: 0.76 }

# Sonnet via Anthropic API (BYOK) — an OPT-IN alternative to the Claude Code CLI
# lane above, NOT a default. When the `claude` CLI is available, the subscription
# CLI lane runs Sonnet (no metered $); this is here only if you deliberately want
# Sonnet over the API. Flip trust_mode to worker/full to enable. Self-updating.
- id: claude-sonnet-api
kind: api
model: claude-sonnet@latest
trust_mode: blocked
costBasis: metered
provenance: anthropic
jurisdiction: US
endpoint: https://api.anthropic.com/v1/chat/completions # Anthropic's OpenAI-compatible endpoint
authHandle: ANTHROPIC
capability: { codegen: 0.85, boilerplate: 0.86, docs: 0.86, explain: 0.88 }
4 changes: 3 additions & 1 deletion config/prices.seed.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
{
"schema_version": 2,
"frontier_model": "claude-opus-4-7",
"frontier_model": "claude-opus-4-8",
"models": {
"claude-opus-4-8": { "inputPer1M": 5, "outputPer1M": 25, "family": "claude-opus", "released": "2026-05-15" },
"claude-opus-4-7": { "inputPer1M": 15, "outputPer1M": 75, "family": "claude-opus", "released": "2026-01-15" },
"claude-sonnet-4-6": { "inputPer1M": 3, "outputPer1M": 15, "family": "claude-sonnet", "released": "2025-11-14" },
"claude-haiku-4-5-20251001": { "inputPer1M": 1, "outputPer1M": 5, "family": "claude-haiku", "released": "2025-10-01" },
"gpt-5.5": { "inputPer1M": 10, "outputPer1M": 30, "family": "gpt", "released": "2026-02-01" },
"llama3.1:8b": { "inputPer1M": 0, "outputPer1M": 0, "family": "llama", "released": "2024-07-23" },
Expand Down
3 changes: 2 additions & 1 deletion packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,11 @@ export {
pricedIdsInFamily,
newestPricedInFamily,
resolveLaneModel,
staleAgainstPriceTable,
sameFamily,
assessStaleness,
} from './model-freshness.ts';
export type { ModelSpec, FamilyModel, StalenessReport } from './model-freshness.ts';
export type { ModelSpec, FamilyModel, StalenessReport, PriceTableStaleness } from './model-freshness.ts';
export {
LedgerError,
EVENT_FIELDS,
Expand Down
44 changes: 44 additions & 0 deletions packages/core/src/model-freshness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,50 @@ export function resolveLaneModel<L extends { model: string }>(lane: L, table: Pr
return concrete ? { ...lane, model: concrete } : lane;
}

/**
 * One finding from the price-table staleness check: a lane whose concrete model
 * is behind the newest priced model of the same family. Derived from the price
 * table alone, so it applies to any lane kind.
 */
export interface PriceTableStaleness {
  /** Id of the lane the finding is about. */
  laneId: string;
  /** Family within which the lane's model was compared. */
  family: string;
  /** Concrete model the lane uses today (a `<family>@latest` alias is resolved first). */
  pinned: string;
  /** Newest priced model in `family`, i.e. the model the lane should move to. */
  newest: string;
}

/**
 * Report every lane whose concrete model is older than the newest priced model in
 * its family, consulting the price table only.
 *
 * For each lane:
 *  - the concrete model is the newest priced id of the family for a
 *    `<family>@latest` alias, or the pin itself otherwise;
 *  - the family is the alias stem, else the lane's explicit `model_family`, else the
 *    `family` recorded in the price table for the concrete id. It is never derived
 *    from the id string.
 *
 * A lane yields no finding when its alias has no priced member, when its family
 * cannot be determined, or when no strictly newer priced model exists. An
 * `@latest` lane resolves to the newest priced id and is therefore never reported.
 *
 * @param lanes Lanes to inspect; only `id`, `model` and `model_family` are read.
 * @param table Price table used to resolve aliases and rank models.
 * @returns One finding per stale lane, in input order.
 */
export function staleAgainstPriceTable<L extends { id: string; model: string; model_family?: string }>(
  lanes: readonly L[],
  table: PriceTable,
): PriceTableStaleness[] {
  return lanes.flatMap((lane): PriceTableStaleness[] => {
    const alias = parseModelAlias(lane.model);

    // Concrete model this lane would actually run.
    const current = alias.latest ? newestPricedInFamily(table, alias.family) : alias.id;
    if (!current) return []; // alias with no priced family member: nothing to compare here.

    const family = alias.latest
      ? alias.family
      : (lane.model_family ?? table.models[current]?.family);
    if (!family) return []; // family unknown: do not guess it from the id.

    const newest = newestPricedInFamily(table, family);
    if (!newest || newest === current) return [];

    // Report only when `newest` sorts strictly ahead of the current model.
    return compareNewestFirst(table, newest, current) < 0
      ? [{ laneId: lane.id, family, pinned: current, newest }]
      : [];
  });
}

/** A model id (optionally with a vendor `created` epoch) for family matching. */
export interface FamilyModel {
id: string;
Expand Down
8 changes: 7 additions & 1 deletion packages/core/src/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,13 @@ export function makeCliExecutor(spawnImpl?: SpawnLike): TrustedExecFn {
return async (lane, instruction, attachments) => {
if (!lane.command) throw new Error(`cli lane "${lane.id}" has no command configured`);
const input = combinedPrompt(instruction, attachments);
const res = spawn(lane.command, lane.args ?? [], { input, encoding: 'utf8', maxBuffer: 64 * 1024 * 1024 });
// `{model}` placeholder substitution: a CLI lane can pass `--model {model}` in its
// args instead of hard-pinning a version, so the spawn always uses the lane's
// CURRENT model. By the time a lane reaches the executor its `model` is already the
// concrete, price-table-resolved id (a `<family>@latest` alias has been resolved on
// the routing path), so this keeps CLI lanes self-updating with no stale literal.
const args = (lane.args ?? []).map((a) => a.replaceAll('{model}', lane.model));
const res = spawn(lane.command, args, { input, encoding: 'utf8', maxBuffer: 64 * 1024 * 1024 });
if (res.error) throw new LaneFailure('provider_error', `cli lane "${lane.id}" failed to spawn`);
if (res.status !== 0) throw new LaneFailure('provider_error', `cli lane "${lane.id}" exited with status ${res.status}`);
return { resultText: res.stdout ?? '' }; // CLIs rarely report tokens ⇒ estimated downstream
Expand Down
21 changes: 10 additions & 11 deletions packages/core/src/registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -226,17 +226,16 @@ function parseLane(entry: unknown, index: number): Lane {
throw new LaneConfigError(`${at('endpoint')}: an api lane requires an endpoint.`);
}
}
// A `<family>@latest` alias is resolved against the price table at routing time.
// Reject anything ending in "@latest" that isn't a well-formed alias on an api
// lane: bare "@latest" (empty family stem) would otherwise parse as a concrete id
// and could reach execution literally; CLI/local lanes pin a concrete model.
if (lane.model.trim().endsWith('@latest')) {
if (lane.kind !== 'api') {
throw new LaneConfigError(`${at('model')}: a "<family>@latest" alias is only supported on api lanes.`);
}
if (!parseModelAlias(lane.model).latest) {
throw new LaneConfigError(`${at('model')}: "@latest" needs a family stem, e.g. "minimax@latest".`);
}
// A `<family>@latest` alias is resolved against the price table at routing time
// (and on the summary path), so a lane tracks the newest priced model in its family
// instead of hard-pinning a version that silently goes stale. Supported on ANY lane
// kind: api lanes send the resolved id in the request body; cli/local lanes spawn the
// resolved id via a `{model}` arg placeholder (see makeCliExecutor) or, for the native
// host lane, use it for pricing/display only. We only reject a MALFORMED alias — bare
// "@latest" with an empty family stem — which would otherwise parse as a concrete id
// and could reach execution literally.
if (lane.model.trim().endsWith('@latest') && !parseModelAlias(lane.model).latest) {
throw new LaneConfigError(`${at('model')}: "@latest" needs a family stem, e.g. "claude-opus@latest".`);
}
return lane;
}
Expand Down
31 changes: 31 additions & 0 deletions packages/core/test/model-freshness.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
pricedIdsInFamily,
newestPricedInFamily,
resolveLaneModel,
staleAgainstPriceTable,
sameFamily,
assessStaleness,
} from '../src/model-freshness.ts';
Expand Down Expand Up @@ -127,3 +128,33 @@ test('newestPricedInFamily falls back to version order when releases are absent'
};
assert.equal(newestPricedInFamily(noDates, 'foo'), 'foo-10'); // 10 > 2 numerically
});

// --- staleAgainstPriceTable: the egress-free "are the latest models in use?" check ---

test('staleAgainstPriceTable flags a concrete pin behind the newest priced in family', () => {
  // The check is kind-agnostic: a lane only needs an id and a model. Here the pin is
  // an older minimax model.
  const stalePin = { id: 'l', model: 'minimax-m2' };
  const expected = { laneId: 'l', family: 'minimax', pinned: 'minimax-m2', newest: 'minimax-m3' };

  const findings = staleAgainstPriceTable([stalePin], table);

  assert.equal(findings.length, 1);
  assert.deepEqual(findings[0], expected);
});

test('staleAgainstPriceTable does NOT flag a lane already on the newest priced model', () => {
  // A pin that already equals the newest priced id must produce no finding.
  const upToDate = { id: 'l', model: 'minimax-m3' };
  const findings = staleAgainstPriceTable([upToDate], table);
  assert.deepEqual(findings, []);
});

test('staleAgainstPriceTable never flags a <family>@latest lane (it resolves to newest)', () => {
  // An alias lane always resolves to the newest priced id, so it can never be behind.
  const aliased = { id: 'l', model: 'minimax@latest' };
  const findings = staleAgainstPriceTable([aliased], table);
  assert.deepEqual(findings, []);
});

test('staleAgainstPriceTable skips a pin with no resolvable family (no prefix guessing)', () => {
  // `opus` carries no family metadata and the lane declares no model_family, so the
  // check has nothing to compare against and must stay silent.
  const familyless = { id: 'l', model: 'opus' };
  const findings = staleAgainstPriceTable([familyless], table);
  assert.deepEqual(findings, []);
});

test('staleAgainstPriceTable uses an explicit model_family to judge an unpriced pin', () => {
  // The lane supplies the family itself through `model_family`.
  const explicitFamily = { id: 'l', model: 'minimax-m1', model_family: 'minimax' };

  const findings = staleAgainstPriceTable([explicitFamily], table);

  assert.equal(findings.length, 1);
  const first = findings[0]!;
  assert.equal(first.pinned, 'minimax-m1');
  assert.equal(first.newest, 'minimax-m3');
});
17 changes: 17 additions & 0 deletions packages/core/test/node-executors.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,23 @@ test('makeCliExecutor throws on a non-zero exit (so runTask degrades)', async ()
await assert.rejects(() => exec(codexCli, 'x'));
});

test('makeCliExecutor substitutes the {model} placeholder with the resolved lane model', async () => {
  // MODEL-FRESHNESS: a cli lane passes `--model {model}` rather than a hard-pinned id,
  // so the spawned command receives whatever concrete model the lane carries.
  const sonnetLane: Lane = {
    id: 'claude-sonnet', kind: 'cli', model: 'claude-sonnet-4-6', trust_mode: 'full',
    costBasis: 'subscription', provenance: 'anthropic', jurisdiction: 'US',
    command: 'claude', args: ['-p', '--model', '{model}'], capability: { codegen: 0.85 },
  };

  // Fake spawn: record the argv it was handed and report success.
  let spawnedArgs: readonly string[] = [];
  const runCli = makeCliExecutor((_cmd, args) => {
    spawnedArgs = args;
    return { status: 0, stdout: 'ok' };
  });

  await runCli(sonnetLane, 'do it');

  // The placeholder must be replaced by lane.model; the other args pass through.
  assert.deepEqual(spawnedArgs, ['-p', '--model', 'claude-sonnet-4-6']);
});

test('makeOllamaExecutor posts to /api/generate and maps eval counts to usage', async () => {
let url: string | undefined;
const exec = makeOllamaExecutor(async (u) => {
Expand Down
13 changes: 11 additions & 2 deletions packages/core/test/price.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,19 @@ test('validatePriceTable rejects a non-object', () => {
test('loadPriceTable reads and validates the shipped seed file', () => {
const seedPath = new URL('../../../config/prices.seed.json', import.meta.url);
const t = loadPriceTable(seedPath);
assert.equal(t.frontier_model, 'claude-opus-4-7');
// MODEL-FRESHNESS: the frontier baseline tracks the current most-capable Claude
// (claude-opus-4-8); the previous frontier stays priced for back-compat.
assert.equal(t.frontier_model, 'claude-opus-4-8');
assert.equal(t.schema_version, 2); // MODEL-FRESHNESS: metadata-carrying seed
assert.equal(Object.keys(t.models).length, 9);
assert.equal(Object.keys(t.models).length, 11);
assert.equal(t.models['claude-opus-4-8']?.inputPer1M, 5);
assert.equal(t.models['claude-opus-4-8']?.outputPer1M, 25);
assert.equal(t.models['claude-opus-4-8']?.family, 'claude-opus');
assert.equal(t.models['claude-opus-4-7']?.inputPer1M, 15);
// Sonnet 4.6 priced + family-tagged so claude-sonnet@latest resolves to it.
assert.equal(t.models['claude-sonnet-4-6']?.inputPer1M, 3);
assert.equal(t.models['claude-sonnet-4-6']?.outputPer1M, 15);
assert.equal(t.models['claude-sonnet-4-6']?.family, 'claude-sonnet');
assert.equal(t.models['claude-haiku-4-5-20251001']?.outputPer1M, 5);
// F2-S5: metered vendor models priced so opted-up reader/worker lanes are routable.
assert.ok(t.models['glm-5.1']);
Expand Down
Loading
Loading