From 888db70f89d913ebfce94ba5851e854b4e8e4046 Mon Sep 17 00:00:00 2001 From: Jay Shitre Date: Thu, 11 Jun 2026 12:32:41 +0530 Subject: [PATCH 1/3] fix(windows): retry spurious spawnSync ETIMEDOUT in the OpenCode plugin (#46) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When OpenCode fires hooks for a concurrent batch (e.g. multiple file writes in one turn), the FIRST hook's execSync threw ETIMEDOUT after ~15ms — not the 15s timeout — so that file silently got no diff or neo-tree marker. Confirmed on a real Windows box: across several 5-file runs the first file always failed fast (ETIMEDOUT/SIGTERM, attempt 1, ~12-19ms) while the rest succeeded. Root cause is a Node spawnSync behaviour: its timeout deadline is derived from libuv's *cached* loop time, refreshed once per loop iteration. The first spawnSync right after async work (the awaited enqueueHook) sees a stale "now", so `now + timeout` is already in the past and libuv SIGTERMs the child the instant it spawns. Unix execs the .sh directly and doesn't hit this the same way (it's why the bug is Windows-only). Fix: make runHook async and retry a *spurious* ETIMEDOUT (ETIMEDOUT that returns faster than SPURIOUS_TIMEOUT_MS), but `await` a turn of the event loop first so libuv refreshes its cached time — a synchronous retry would re-read the same stale value (which is exactly why the *next* hook in a burst always succeeds). A genuine ~15s timeout is not retried, so hang-protection is preserved. enqueueHook now awaits the async runHook, so ordering is unchanged. Validated: retry fired 7x across the runs, all recovered on attempt 1; every 5-file burst now delivers all 5 pre/post hooks to the shim. Also documents that OpenCode's after-hook carries tool args on `input` (the before-hook on `output`) — confirmed against the live API. Co-Authored-By: Claude Opus 4.8 --- backends/opencode/index.ts | 69 ++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/backends/opencode/index.ts b/backends/opencode/index.ts index d70f65b..86749c8 100644 --- a/backends/opencode/index.ts +++ b/backends/opencode/index.ts @@ -56,7 +56,25 @@ const PREVIEW_TOOLS = new Set(["edit", "write", "multiedit", "bash", "apply_patc // ── Shim invocation ────────────────────────────────────────────── -function runHook(event: "pre" | "post", payload: object): void { +const HOOK_TIMEOUT_MS = 15000 +// A genuine timeout takes ~HOOK_TIMEOUT_MS; the spurious libuv timeout below +// returns almost instantly. Anything faster than this is treated as spurious. +const SPURIOUS_TIMEOUT_MS = 2000 +const MAX_HOOK_ATTEMPTS = 3 + +// Node's spawnSync (used by execSync) derives its timeout deadline from libuv's +// *cached* loop time, which is only refreshed once per loop iteration. The first +// spawnSync that runs right after async work — here, the awaited enqueueHook in +// the tool hooks — sees a stale "now", so `now + timeout` is already in the past +// and libuv SIGTERMs the child the instant it spawns: a spurious ETIMEDOUT that +// returns in ~15ms instead of 15s. On Windows this drops the FIRST hook of a +// concurrent burst, so that file gets no diff/marker (issue #46; Unix is +// unaffected — it execs the .sh directly and doesn't hit this path the same way). +// +// The fix: retry, but first `await` a turn of the event loop so libuv refreshes +// its cached time — a synchronous retry would re-read the same stale value and +// fail again (which is exactly why the *next* hook in the burst always succeeds). +async function runHook(event: "pre" | "post", payload: object): Promise { const shim = resolveHookEntry() if (!shim) { // Surface enough breadcrumb that a misconfigured bin-path.txt isn't a @@ -69,20 +87,33 @@ function runHook(event: "pre" | "post", payload: object): void { const cmd = IS_WIN ? `powershell -NoProfile -ExecutionPolicy Bypass -File "${shim}" opencode ${event}` : `"${shim}" opencode ${event}` - try { - execSync(cmd, { - input: JSON.stringify(payload), - env: { ...process.env, CODE_PREVIEW_BACKEND: "opencode" }, - timeout: 15000, - stdio: ["pipe", "pipe", "pipe"], - }) - } catch (err: any) { - // Abstain on any failure. Log timeouts at debug-equivalent because - // a silent 15s hang is otherwise hard to diagnose; everything else - // is treated as best-effort and swallowed. - if (err && (err.code === "ETIMEDOUT" || err.signal === "SIGTERM")) { - // eslint-disable-next-line no-console - console.debug(`[code-preview] hook-entry ${event} timed out after 15s`) + + for (let attempt = 1; attempt <= MAX_HOOK_ATTEMPTS; attempt++) { + const start = Date.now() + try { + execSync(cmd, { + input: JSON.stringify(payload), + env: { ...process.env, CODE_PREVIEW_BACKEND: "opencode" }, + timeout: HOOK_TIMEOUT_MS, + stdio: ["pipe", "pipe", "pipe"], + }) + return + } catch (err: any) { + const elapsed = Date.now() - start + const spurious = + !!err && err.code === "ETIMEDOUT" && elapsed < SPURIOUS_TIMEOUT_MS + if (spurious && attempt < MAX_HOOK_ATTEMPTS) { + // Yield so libuv refreshes its cached loop time before retrying. + await new Promise((resolve) => setImmediate(resolve)) + continue + } + // Abstain on any failure. Log a genuine timeout (a real ~15s hang) since + // it's otherwise hard to diagnose; everything else is best-effort. + if (err && (err.code === "ETIMEDOUT" || err.signal === "SIGTERM")) { + // eslint-disable-next-line no-console + console.debug(`[code-preview] hook-entry ${event} timed out`) + } + return } } } @@ -96,9 +127,9 @@ function runHook(event: "pre" | "post", payload: object): void { let hookQueue: Promise = Promise.resolve() -function enqueueHook(fn: () => void): Promise { - hookQueue = hookQueue.then(() => { - try { fn() } catch { /* non-fatal */ } +function enqueueHook(fn: () => void | Promise): Promise { + hookQueue = hookQueue.then(async () => { + try { await fn() } catch { /* non-fatal */ } }) return hookQueue } @@ -116,6 +147,8 @@ const plugin: Plugin = async ({ directory }) => { "tool.execute.after": async (input, _output) => { if (!PREVIEW_TOOLS.has(input.tool)) return + // OpenCode's after-hook carries the tool args on `input` (the before-hook + // carries them on `output`), confirmed on the live API (issue #46). const args = ((input as any).args as Record) ?? {} const payload = { tool: input.tool, args, cwd: directory } await enqueueHook(() => runHook("post", payload)) From f1d35ab8c8ad9a50753553929590e79f6ba97b89 Mon Sep 17 00:00:00 2001 From: Jay Shitre Date: Thu, 11 Jun 2026 22:59:24 +0530 Subject: [PATCH 2/3] fix(windows): match SIGTERM as well as ETIMEDOUT for the spurious-timeout retry (#46) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-up. The retry keyed only on `err.code === "ETIMEDOUT"`, but Node's timeout-kill can surface as `signal: 'SIGTERM'` with a null code on some platforms — there the spurious-timeout would be missed and the first hook of a concurrent burst would silently drop again. The logging branch already checked both, so the retry detection was narrower than the logging for no reason. Unify on a single `timedOut` predicate (code ETIMEDOUT OR signal SIGTERM), still gated on `elapsed < SPURIOUS_TIMEOUT_MS` so a genuine ~15s hang is never retried. Also restore elapsed ms in the timeout log so a real hang is distinguishable from exhausted spurious retries. Co-Authored-By: Claude Opus 4.8 --- backends/opencode/index.ts | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/backends/opencode/index.ts b/backends/opencode/index.ts index 86749c8..eac928d 100644 --- a/backends/opencode/index.ts +++ b/backends/opencode/index.ts @@ -100,18 +100,23 @@ async function runHook(event: "pre" | "post", payload: object): Promise { return } catch (err: any) { const elapsed = Date.now() - start - const spurious = - !!err && err.code === "ETIMEDOUT" && elapsed < SPURIOUS_TIMEOUT_MS - if (spurious && attempt < MAX_HOOK_ATTEMPTS) { + // A timeout-kill surfaces as code ETIMEDOUT and/or signal SIGTERM — Node + // has historically reported one or the other depending on platform — so + // match both, or the spurious-timeout retry could be missed where only + // SIGTERM is set. Still gated on elapsed < SPURIOUS_TIMEOUT_MS below, so a + // genuine ~15s hang (also SIGTERM'd) is never mistaken for spurious. + const timedOut = + !!err && (err.code === "ETIMEDOUT" || err.signal === "SIGTERM") + if (timedOut && elapsed < SPURIOUS_TIMEOUT_MS && attempt < MAX_HOOK_ATTEMPTS) { // Yield so libuv refreshes its cached loop time before retrying. await new Promise((resolve) => setImmediate(resolve)) continue } - // Abstain on any failure. Log a genuine timeout (a real ~15s hang) since - // it's otherwise hard to diagnose; everything else is best-effort. - if (err && (err.code === "ETIMEDOUT" || err.signal === "SIGTERM")) { + // Abstain on any failure. Log a genuine timeout with elapsed ms so a real + // hang (~15s) is distinguishable from exhausted spurious retries. + if (timedOut) { // eslint-disable-next-line no-console - console.debug(`[code-preview] hook-entry ${event} timed out`) + console.debug(`[code-preview] hook-entry ${event} timed out after ${elapsed}ms`) } return } From bb2428bcac59bee2aa0543eacab6ad57bbe6a4bb Mon Sep 17 00:00:00 2001 From: Jay Shitre Date: Thu, 11 Jun 2026 23:07:20 +0530 Subject: [PATCH 3/3] test(opencode): regression guard for the spurious-timeout retry (#46) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-up (optional hardening). The Windows libuv quirk that makes the first post-async spawnSync spuriously time out can't be reproduced on CI, so the retry was previously only covered by manual Windows validation — fragile against a later "simplification". Extract the retry loop into an exported, injectable `runWithSpuriousRetry(run, label)` (production behaviour unchanged — runHook just passes the execSync call as `run`), and add a unit guard that drives it with a fake `run`: - fast ETIMEDOUT recovers on retry (the core case) - fast SIGTERM-only recovers (guards the platform-variance fix) - a non-timeout error is NOT retried - a persistent fast timeout is bounded at MAX_HOOK_ATTEMPTS (no infinite loop) - success on first attempt runs once retry_test.ts imports the real index.ts and asserts call counts; test_retry.sh drives it via bun/npx tsx (no nvim needed) and skips cleanly when neither is present. Uses pathToFileURL so the dynamic import also works on Windows. Co-Authored-By: Claude Opus 4.8 --- backends/opencode/index.ts | 77 +++++++++++++--------- tests/backends/opencode/retry_test.ts | 92 +++++++++++++++++++++++++++ tests/backends/opencode/test_retry.sh | 36 +++++++++++ 3 files changed, 175 insertions(+), 30 deletions(-) create mode 100644 tests/backends/opencode/retry_test.ts create mode 100644 tests/backends/opencode/test_retry.sh diff --git a/backends/opencode/index.ts b/backends/opencode/index.ts index eac928d..184278c 100644 --- a/backends/opencode/index.ts +++ b/backends/opencode/index.ts @@ -62,41 +62,34 @@ const HOOK_TIMEOUT_MS = 15000 const SPURIOUS_TIMEOUT_MS = 2000 const MAX_HOOK_ATTEMPTS = 3 -// Node's spawnSync (used by execSync) derives its timeout deadline from libuv's -// *cached* loop time, which is only refreshed once per loop iteration. The first -// spawnSync that runs right after async work — here, the awaited enqueueHook in -// the tool hooks — sees a stale "now", so `now + timeout` is already in the past -// and libuv SIGTERMs the child the instant it spawns: a spurious ETIMEDOUT that -// returns in ~15ms instead of 15s. On Windows this drops the FIRST hook of a -// concurrent burst, so that file gets no diff/marker (issue #46; Unix is -// unaffected — it execs the .sh directly and doesn't hit this path the same way). +// Run `run` (a synchronous, throwing operation — here a spawnSync), retrying a +// SPURIOUS timeout. +// +// Why this exists: Node's spawnSync (used by execSync) derives its timeout +// deadline from libuv's *cached* loop time, which is only refreshed once per +// loop iteration. The first spawnSync that runs right after async work — here, +// the awaited enqueueHook in the tool hooks — sees a stale "now", so +// `now + timeout` is already in the past and libuv SIGTERMs the child the +// instant it spawns: a spurious ETIMEDOUT that returns in ~15ms instead of 15s. +// On Windows this drops the FIRST hook of a concurrent burst, so that file gets +// no diff/marker (issue #46; Unix is unaffected — it execs the .sh directly and +// doesn't hit this the same way). // // The fix: retry, but first `await` a turn of the event loop so libuv refreshes // its cached time — a synchronous retry would re-read the same stale value and -// fail again (which is exactly why the *next* hook in the burst always succeeds). -async function runHook(event: "pre" | "post", payload: object): Promise { - const shim = resolveHookEntry() - if (!shim) { - // Surface enough breadcrumb that a misconfigured bin-path.txt isn't a - // silently-broken plugin. - // eslint-disable-next-line no-console - console.debug(`[code-preview] could not resolve hook-entry shim`) - return - } - // On Windows the .ps1 runs through PowerShell; on Unix the .sh runs directly. - const cmd = IS_WIN - ? `powershell -NoProfile -ExecutionPolicy Bypass -File "${shim}" opencode ${event}` - : `"${shim}" opencode ${event}` - +// fail again (which is exactly why the *next* hook in a burst always succeeds). +// A genuine timeout takes ~HOOK_TIMEOUT_MS, far above SPURIOUS_TIMEOUT_MS, so it +// is never retried. `label` is used only for the diagnostic log. Exported so the +// retry behaviour can be exercised by the test harness (it's a Windows libuv +// quirk that can't otherwise be reproduced on CI). +export async function runWithSpuriousRetry( + run: () => void, + label = "hook-entry", +): Promise { for (let attempt = 1; attempt <= MAX_HOOK_ATTEMPTS; attempt++) { const start = Date.now() try { - execSync(cmd, { - input: JSON.stringify(payload), - env: { ...process.env, CODE_PREVIEW_BACKEND: "opencode" }, - timeout: HOOK_TIMEOUT_MS, - stdio: ["pipe", "pipe", "pipe"], - }) + run() return } catch (err: any) { const elapsed = Date.now() - start @@ -116,13 +109,37 @@ async function runHook(event: "pre" | "post", payload: object): Promise { // hang (~15s) is distinguishable from exhausted spurious retries. if (timedOut) { // eslint-disable-next-line no-console - console.debug(`[code-preview] hook-entry ${event} timed out after ${elapsed}ms`) + console.debug(`[code-preview] ${label} timed out after ${elapsed}ms`) } return } } } +async function runHook(event: "pre" | "post", payload: object): Promise { + const shim = resolveHookEntry() + if (!shim) { + // Surface enough breadcrumb that a misconfigured bin-path.txt isn't a + // silently-broken plugin. + // eslint-disable-next-line no-console + console.debug(`[code-preview] could not resolve hook-entry shim`) + return + } + // On Windows the .ps1 runs through PowerShell; on Unix the .sh runs directly. + const cmd = IS_WIN + ? `powershell -NoProfile -ExecutionPolicy Bypass -File "${shim}" opencode ${event}` + : `"${shim}" opencode ${event}` + + await runWithSpuriousRetry(() => { + execSync(cmd, { + input: JSON.stringify(payload), + env: { ...process.env, CODE_PREVIEW_BACKEND: "opencode" }, + timeout: HOOK_TIMEOUT_MS, + stdio: ["pipe", "pipe", "pipe"], + }) + }, `hook-entry ${event}`) +} + // ── Hook serialisation ─────────────────────────────────────────── // TS→nvim send-order preservation: OpenCode fires before(A) and after(B) // concurrently; without this, RPCs can reorder during socket discovery and a diff --git a/tests/backends/opencode/retry_test.ts b/tests/backends/opencode/retry_test.ts new file mode 100644 index 0000000..7bbbf3f --- /dev/null +++ b/tests/backends/opencode/retry_test.ts @@ -0,0 +1,92 @@ +// retry_test.ts — Unit guard for runWithSpuriousRetry (issue #46). +// +// The Windows libuv quirk — the first spawnSync after async work spuriously +// times out because libuv's cached loop time is stale — can't be reproduced on +// CI. So we test the retry logic directly by injecting a `run` that throws a +// fast ETIMEDOUT / SIGTERM. Cheap insurance against someone later "simplifying" +// the retry (or the SIGTERM match) away. No nvim required. +// +// Run via: bun retry_test.ts (or) npx tsx retry_test.ts +// Prints "ALL OK" and exits 0 on success; exits 1 on any failed check. + +import { resolve, dirname } from "path" +import { fileURLToPath, pathToFileURL } from "url" + +const __dirname = dirname(fileURLToPath(import.meta.url)) + +type RunWithSpuriousRetry = (run: () => void, label?: string) => Promise + +// Build an error shaped like Node's timeout-kill: ETIMEDOUT via `code`, or a +// SIGTERM kill via `signal` (platform-dependent — see the retry's `timedOut`). +function timeoutErr(kind: "code" | "signal"): Error { + const e = new Error("simulated timeout") as any + if (kind === "code") e.code = "ETIMEDOUT" + else e.signal = "SIGTERM" + return e +} + +let failures = 0 +function check(name: string, cond: boolean): void { + console.log(`${cond ? "ok " : "FAIL"} - ${name}`) + if (!cond) failures++ +} + +async function main(): Promise { + // pathToFileURL so the dynamic import works with Windows absolute paths too + // (the bare `D:\…` path is rejected by the ESM loader as an unsupported scheme). + const indexPath = resolve(__dirname, "../../../backends/opencode/index.ts") + const mod = await import(pathToFileURL(indexPath).href) + const runWithSpuriousRetry = mod.runWithSpuriousRetry as RunWithSpuriousRetry + + // Success on the first attempt → run called exactly once. + { + let calls = 0 + await runWithSpuriousRetry(() => { calls++ }) + check("success on first attempt runs once", calls === 1) + } + + // The core case: a fast ETIMEDOUT recovers on retry (the libuv quirk). + { + let calls = 0 + await runWithSpuriousRetry(() => { calls++; if (calls === 1) throw timeoutErr("code") }) + check("fast ETIMEDOUT recovers on retry (runs twice)", calls === 2) + } + + // Platform variance: a SIGTERM-only fast kill must also recover (review fix). + { + let calls = 0 + await runWithSpuriousRetry(() => { calls++; if (calls === 1) throw timeoutErr("signal") }) + check("fast SIGTERM recovers on retry (runs twice)", calls === 2) + } + + // A non-timeout error must NOT be retried — abstain after one attempt. + { + let calls = 0 + await runWithSpuriousRetry(() => { + calls++ + const e = new Error("nope") as any + e.code = "ENOENT" + throw e + }) + check("non-timeout error is not retried (runs once)", calls === 1) + } + + // A persistent fast timeout must be bounded at MAX_HOOK_ATTEMPTS — no infinite + // loop if every attempt spuriously times out. + { + let calls = 0 + await runWithSpuriousRetry(() => { calls++; throw timeoutErr("code") }) + check("persistent fast timeout is bounded (runs 3x)", calls === 3) + } + + if (failures > 0) { + console.log(`RETRY GUARD FAILED (${failures})`) + process.exit(1) + } + console.log("ALL OK") +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/tests/backends/opencode/test_retry.sh b/tests/backends/opencode/test_retry.sh new file mode 100644 index 0000000..253dfcb --- /dev/null +++ b/tests/backends/opencode/test_retry.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# test_retry.sh — Unit guard for the OpenCode plugin's spurious-timeout retry. +# +# The Windows libuv quirk (the first spawnSync after async work spuriously times +# out; issue #46) can't be reproduced on CI, so this drives retry_test.ts, which +# exercises runWithSpuriousRetry directly with an injected `run` that throws a +# fast ETIMEDOUT/SIGTERM. No nvim/socket needed — it's a pure unit guard against +# the retry being accidentally removed or narrowed. + +# ── Check for tsx/bun ──────────────────────────────────────────── + +_OPENCODE_RUNNER="" +if command -v bun >/dev/null 2>&1; then + _OPENCODE_RUNNER="bun" +elif command -v npx >/dev/null 2>&1; then + _OPENCODE_RUNNER="npx tsx" +else + echo -e "${YELLOW} ⊘ Skipping OpenCode retry guard (neither bun nor npx found)${NC}" + return 0 2>/dev/null || exit 0 +fi + +RETRY_TEST="$SCRIPT_DIR/backends/opencode/retry_test.ts" + +# ── Test: retry helper recovers / is bounded ───────────────────── + +test_opencode_retry_guard() { + local output + output="$($_OPENCODE_RUNNER "$RETRY_TEST" 2>&1)" + if [[ "$output" != *"ALL OK"* ]]; then + echo -e " ${RED}retry guard output:${NC}" >&2 + echo "$output" >&2 + return 1 + fi +} + +run_test "OpenCode retry recovers fast ETIMEDOUT/SIGTERM, bounded, no over-retry" test_opencode_retry_guard