diff --git a/rust/crates/api/src/error.rs b/rust/crates/api/src/error.rs index 1dc4fb295f..8b57c47e99 100644 --- a/rust/crates/api/src/error.rs +++ b/rust/crates/api/src/error.rs @@ -20,6 +20,7 @@ const CONTEXT_WINDOW_ERROR_MARKERS: &[&str] = &[ "completion tokens", "prompt tokens", "request is too large", + "no parseable body", ]; #[derive(Debug)] diff --git a/rust/crates/runtime/src/conversation.rs b/rust/crates/runtime/src/conversation.rs index ea50e3bd25..eb6c942cdd 100644 --- a/rust/crates/runtime/src/conversation.rs +++ b/rust/crates/runtime/src/conversation.rs @@ -92,6 +92,28 @@ pub struct RuntimeError { message: String, } +/// Markers that indicate a context window / token limit error when the +/// original ApiError type has been erased into a string message. +/// Mirrors the markers in `api::error::CONTEXT_WINDOW_ERROR_MARKERS`, including +/// the "no parseable body" heuristic from PR #3214 (OpenAI-compat backends +/// return 400 with an un-parseable body when the request exceeds the +/// model's context length). +const RUNTIME_CONTEXT_WINDOW_MARKERS: &[&str] = &[ + "maximum context length", + "context window", + "context length", + "too many tokens", + "prompt is too long", + "input is too long", + "input tokens exceed", + "configured limit", + "messages resulted in", + "completion tokens", + "prompt tokens", + "request is too large", + "no parseable body", +]; + impl RuntimeError { #[must_use] pub fn new(message: impl Into) -> Self { @@ -99,6 +121,21 @@ impl RuntimeError { message: message.into(), } } + + /// Returns `true` when the error message is consistent with a context + /// window / token-limit exceeded error from the upstream provider. + /// + /// This is the runtime-level counterpart of + /// `api::ApiError::is_context_window_failure()`, needed because the + /// API error type is erased into a plain string when it crosses the + /// runtime boundary. 
+ #[must_use] + pub fn is_context_window_failure(&self) -> bool { + let lowered = self.message.to_ascii_lowercase(); + RUNTIME_CONTEXT_WINDOW_MARKERS + .iter() + .any(|marker| lowered.contains(marker)) + } } impl Display for RuntimeError { diff --git a/rust/crates/rusty-claude-cli/src/main.rs b/rust/crates/rusty-claude-cli/src/main.rs index 5febf8417a..931f97fddd 100644 --- a/rust/crates/rusty-claude-cli/src/main.rs +++ b/rust/crates/rusty-claude-cli/src/main.rs @@ -59,7 +59,7 @@ use runtime::{ ConfigSource, ContentBlock, ConversationMessage, ConversationRuntime, McpServer, McpServerManager, McpServerSpec, McpTool, MessageRole, ModelPricing, PermissionMode, PermissionPolicy, ProjectContext, PromptCacheEvent, ResolvedPermissionMode, RuntimeError, - Session, TokenUsage, ToolError, ToolExecutor, UsageTracker, + Session, TokenUsage, ToolError, ToolExecutor, TurnSummary, UsageTracker, }; use serde::Deserialize; use serde_json::{json, Map, Value}; @@ -5684,6 +5684,117 @@ impl LiveCli { Ok(()) } + /// Maximum number of auto-compact-and-retry attempts when a context window + /// error is detected. After this many attempts the error is surfaced to the + /// user unchanged. + const MAX_COMPACT_RETRIES: usize = 3; + + /// When a turn fails with a context-window error, automatically compact the + /// session (removing old messages to free token budget) and retry the same + /// user input. Each retry round preserves fewer recent messages + /// (`preserve_schedule`) to trade conversation continuity for a smaller + /// payload until it fits. + /// + /// Returns `Ok(TurnSummary)` if the retry succeeded after compaction, or + /// `Err(RuntimeError)` if the error was not a context-window error or all + /// retry rounds were exhausted. 
+ fn auto_compact_retry( + &mut self, + runtime: &mut BuiltRuntime, + input: &str, + error: RuntimeError, + ) -> Result { + if !error.is_context_window_failure() { + return Err(error); + } + + // Progressive compaction: each round preserves fewer recent messages + // (4 → 2 → 0), trading conversation continuity for a smaller + // payload until it fits. + let preserve_schedule: [usize; Self::MAX_COMPACT_RETRIES] = [4, 2, 0]; + + for round in 0..Self::MAX_COMPACT_RETRIES { + let preserve = preserve_schedule[round]; + println!( + " Context limit reached, auto-compacting session... (attempt {}/{})", + round + 1, + Self::MAX_COMPACT_RETRIES + ); + + // Run Trident pipeline then summary-based compaction + let result = runtime::trident::trident_compact_session( + runtime.session(), + CompactionConfig { + preserve_recent_messages: preserve, + max_estimated_tokens: 0, + }, + &runtime::trident::TridentConfig::default(), + ); + let removed = result.removed_message_count; + + if removed == 0 && round > 0 { + // No more messages to compact — further rounds won't help + println!(" No further compaction possible."); + break; + } + + if removed > 0 { + println!( + "{}", + format_compact_report( + removed, + result.compacted_session.messages.len(), + false + ) + ); + } + + // Without this, prepare_turn_runtime() reads from + // self.runtime.session() which still holds the ORIGINAL + // un-compacted session, so every retry round would send the same + // bloated request — compaction was wasted. 
+ *self.runtime.session_mut() = result.compacted_session.clone(); + + // Build a new runtime with the compacted session and retry + let (mut new_runtime, hook_abort_monitor) = + match self.prepare_turn_runtime(true) { + Ok(pair) => pair, + Err(e) => return Err(RuntimeError::new(e.to_string())), + }; + drop(hook_abort_monitor); + + let mut rp = CliPermissionPrompter::new(self.permission_mode); + match new_runtime.run_turn(input, Some(&mut rp)) { + Ok(summary) => { + // Retry succeeded — swap in the compacted runtime + if let Err(e) = self.replace_runtime(new_runtime) { + return Err(RuntimeError::new(e.to_string())); + } + return Ok(summary); + } + Err(retry_error) => { + if retry_error.is_context_window_failure() + && round + 1 < Self::MAX_COMPACT_RETRIES + { + // The compacted session was still too large. + // Shut down the old runtime, adopt the partially + // compacted one, and loop — the next round will + // compact more aggressively. + let _ = runtime.shutdown_plugins(); + *runtime = new_runtime; + continue; + } + + // Not a context window error, or out of rounds + return Err(retry_error); + } + } + } + + // No further compaction was possible — propagate the original error + Err(error) + } + fn run_turn(&mut self, input: &str) -> Result<(), Box> { let (mut runtime, hook_abort_monitor) = self.prepare_turn_runtime(true)?; let mut spinner = Spinner::new(); @@ -5726,135 +5837,25 @@ impl LiveCli { &mut stdout, )?; - // ============================================================================ - // Auto-compact retry on context window errors - // ============================================================================ - // When the model API returns a context_window_blocked error (because the request - // exceeds the model's context window), we automatically: - // 1. Compact the session (remove old messages to free up space) - // 2. Retry the original request with the compacted session - // 3. 
Report results to the user - // - // This eliminates the need for users to manually run /compact when they - // hit context limits - the recovery happens automatically. - // - // Detection: We look for "context_window" or "Context window" in the error - // message, which covers error types like: - // - "context_window_blocked" - // - "Context window blocked" - // - "This model's maximum context length is X tokens..." - // ============================================================================ - - let error_str = error.to_string(); - // Detect context window overflow. Some providers (e.g. OpenAI-compat backends) - // return 400 with "no parseable body" instead of a proper context_length_exceeded - // error when the request is too large to even parse — treat that as context overflow too. - let is_context_window = error_str.contains("context_window") - || error_str.contains("Context window") - || error_str.contains("no parseable body"); - - if is_context_window { - // A single compaction pass may not free enough context space. - // Progressive retry: each round preserves fewer recent messages (4→2→1→0), - // trading conversation continuity for a smaller payload until it fits. - // Max 4 rounds before giving up and surfacing the error to the user. 
- let max_compact_rounds = 4; - let preserve_schedule = [4, 2, 1, 0]; - - for round in 0..max_compact_rounds { - let preserve = preserve_schedule[round]; - println!( - " Auto-compacting session (round {}/{}, preserving {} recent messages)...", - round + 1, - max_compact_rounds, - preserve - ); - - // Run Trident pipeline then summary-based compaction - let result = runtime::trident::trident_compact_session( - runtime.session(), - CompactionConfig { - preserve_recent_messages: preserve, - max_estimated_tokens: 0, - }, - &runtime::trident::TridentConfig::default(), - ); - let removed = result.removed_message_count; - - if removed == 0 && round > 0 { - // No more messages to compact — further rounds won't help - println!(" No further compaction possible."); - break; - } - - if removed > 0 { + match self.auto_compact_retry(&mut runtime, input, error) { + Ok(summary) => { + spinner.finish( + "✨ Done (after auto-compact)", + TerminalRenderer::new().color_theme(), + &mut stdout, + )?; + println!(); + if let Some(event) = summary.auto_compaction { println!( "{}", - format_compact_report( - removed, - result.compacted_session.messages.len(), - false - ) + format_auto_compaction_notice(event.removed_message_count) ); } - - // Without this, prepare_turn_runtime() reads from self.runtime.session() - // which still holds the ORIGINAL un-compacted session, so every retry round - // would send the same bloated request — compaction was wasted. 
- *self.runtime.session_mut() = result.compacted_session.clone(); - - // Build a new runtime with the compacted session and retry - let (mut new_runtime, hook_abort_monitor) = - self.prepare_turn_runtime(true)?; - drop(hook_abort_monitor); - - let mut rp = CliPermissionPrompter::new(self.permission_mode); - match new_runtime.run_turn(input, Some(&mut rp)) { - Ok(summary) => { - self.replace_runtime(new_runtime)?; - spinner.finish( - if round == 0 { - "✨ Done (after auto-compact)" - } else { - "✨ Done (after aggressive auto-compact)" - }, - TerminalRenderer::new().color_theme(), - &mut stdout, - )?; - println!(); - if let Some(event) = summary.auto_compaction { - println!( - "{}", - format_auto_compaction_notice(event.removed_message_count) - ); - } - self.persist_session()?; - return Ok(()); - } - Err(retry_error) => { - let retry_str = retry_error.to_string(); - let still_context_window = retry_str.contains("context_window") - || retry_str.contains("Context window") - || retry_str.contains("no parseable body"); - - if still_context_window && round + 1 < max_compact_rounds { - // The compacted session was still too large for the model's context. - // Shut down the old runtime, adopt the partially-compacted one, - // and loop — the next round will compact more aggressively. 
- runtime.shutdown_plugins()?; - runtime = new_runtime; - continue; - } - - // Not a context window error, or out of rounds - return Err(Box::new(retry_error)); - } - } + self.persist_session()?; + Ok(()) } + Err(final_error) => Err(Box::new(final_error)), } - - // If not a context window error, return original error - Err(Box::new(error)) } } } @@ -5878,7 +5879,13 @@ impl LiveCli { let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode); let result = runtime.run_turn(input, Some(&mut permission_prompter)); hook_abort_monitor.stop(); - let summary = result?; + let summary = match result { + Ok(s) => s, + Err(error) => { + let _ = runtime.shutdown_plugins(); + self.auto_compact_retry(&mut runtime, input, error)? + } + }; self.replace_runtime(runtime)?; self.persist_session()?; let final_text = final_assistant_text(&summary); @@ -5891,7 +5898,13 @@ impl LiveCli { let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode); let result = runtime.run_turn(input, Some(&mut permission_prompter)); hook_abort_monitor.stop(); - let summary = result?; + let summary = match result { + Ok(s) => s, + Err(error) => { + let _ = runtime.shutdown_plugins(); + self.auto_compact_retry(&mut runtime, input, error)? + } + }; self.replace_runtime(runtime)?; self.persist_session()?; println!( @@ -5916,7 +5929,13 @@ impl LiveCli { let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode); let result = runtime.run_turn(input, Some(&mut permission_prompter)); hook_abort_monitor.stop(); - let summary = result?; + let summary = match result { + Ok(s) => s, + Err(error) => { + let _ = runtime.shutdown_plugins(); + self.auto_compact_retry(&mut runtime, input, error)? + } + }; self.replace_runtime(runtime)?; self.persist_session()?; println!(