diff --git a/internal/engine/capacity_pause.go b/internal/engine/capacity_pause.go index 8637db2..6ce7115 100644 --- a/internal/engine/capacity_pause.go +++ b/internal/engine/capacity_pause.go @@ -13,7 +13,7 @@ import ( // capacity / session-limit exhaustion at a given pipeline stage. The "capacity // limit" phrasing routes pauseResumeHint to the correct operator guidance. func capacityPauseReason(stage string, err error) string { - return fmt.Sprintf("LLM capacity/session limit during %s — resume after reset: %v", stage, err) + return fmt.Sprintf("transient LLM capacity/network error during %s — resume after it clears: %v", stage, err) } // pauseIfCapacity inspects an LLM-call error. If it is a transient capacity diff --git a/internal/llm/capacity_test.go b/internal/llm/capacity_test.go index 2408746..6c613c5 100644 --- a/internal/llm/capacity_test.go +++ b/internal/llm/capacity_test.go @@ -26,6 +26,12 @@ func TestIsCapacityError(t *testing.T) { {name: "rate limit string", err: fmt.Errorf("claude CLI error: rate limit exceeded"), expect: true}, {name: "too many requests string", err: fmt.Errorf("too many requests"), expect: true}, {name: "overloaded string", err: fmt.Errorf("the service is currently overloaded"), expect: true}, + // Transient network/transport failures (api_error_status null) — must + // also classify as transient so they take the clean-pause path. + {name: "socket closed string", err: fmt.Errorf(`claude CLI error: exit status 1 (output: {"is_error":true,"api_error_status":null,"result":"API Error: The socket connection was closed unexpectedly"})`), expect: true}, + {name: "connection reset", err: fmt.Errorf("read tcp: connection reset by peer"), expect: true}, + {name: "i/o timeout", err: fmt.Errorf("dial tcp: i/o timeout"), expect: true}, + {name: "503 service unavailable", err: fmt.Errorf("503 service unavailable"), expect: true}, {name: "api_error_status 429 embedded", err: fmt.Errorf(`output: {"api_error_status":429}`), expect: true}, {name: "api_error_status 529 embedded", err: fmt.Errorf(`output: {"api_error_status":529}`), expect: true}, diff --git a/internal/llm/errors.go b/internal/llm/errors.go index 722340e..9439a52 100644 --- a/internal/llm/errors.go +++ b/internal/llm/errors.go @@ -77,6 +77,21 @@ var capacitySignatures = []string{ `"api_error_status":529`, `"api_error_status": 429`, `"api_error_status": 529`, + + // Transient network/transport failures. Like a session limit, these are not + // a story-quality problem and succeed on retry/resume — so they must take + // the clean-pause path, not burn the escalation chain. Surfaced by the CLI + // as e.g. "API Error: The socket connection was closed unexpectedly". + "socket connection was closed", + "connection closed unexpectedly", + "connection reset", + "connection refused", + "i/o timeout", + "tls handshake timeout", + "unexpected eof", + "service unavailable", + "bad gateway", + "gateway timeout", } // ContainsCapacitySignature reports whether a raw string carries a capacity /