Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rust/crates/api/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ const CONTEXT_WINDOW_ERROR_MARKERS: &[&str] = &[
"completion tokens",
"prompt tokens",
"request is too large",
"no parseable body",
];

#[derive(Debug)]
Expand Down
37 changes: 37 additions & 0 deletions rust/crates/runtime/src/conversation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,50 @@ pub struct RuntimeError {
message: String,
}

/// Substrings that identify a context-window / token-limit failure once the
/// original `ApiError` has been flattened into a plain string message.
///
/// `RuntimeError::is_context_window_failure` lowercases the message (ASCII)
/// before searching it, so every entry here must itself be lowercase.
///
/// Intended to mirror `api::error::CONTEXT_WINDOW_ERROR_MARKERS`; keep the
/// two lists in sync when adding a marker. The final entry,
/// "no parseable body", is the heuristic from PR #3214: OpenAI-compat
/// backends return 400 with an un-parseable body when the request exceeds
/// the model's context length.
const RUNTIME_CONTEXT_WINDOW_MARKERS: &[&str] = &[
"maximum context length",
"context window",
"context length",
"too many tokens",
"prompt is too long",
"input is too long",
"input tokens exceed",
"configured limit",
"messages resulted in",
"completion tokens",
"prompt tokens",
"request is too large",
"no parseable body",
];

impl RuntimeError {
    /// Builds a runtime error carrying `message` as its text.
    #[must_use]
    pub fn new(message: impl Into<String>) -> Self {
        let message = message.into();
        Self { message }
    }

    /// Reports whether this error looks like the upstream provider rejected
    /// the request for exceeding its context window / token limit.
    ///
    /// The check is a case-insensitive (ASCII) substring search of the
    /// message against `RUNTIME_CONTEXT_WINDOW_MARKERS`. It is the
    /// runtime-level counterpart of
    /// `api::ApiError::is_context_window_failure()`, required because the
    /// API error type is erased into a plain string when it crosses the
    /// runtime boundary.
    #[must_use]
    pub fn is_context_window_failure(&self) -> bool {
        let haystack = self.message.to_ascii_lowercase();
        for needle in RUNTIME_CONTEXT_WINDOW_MARKERS.iter() {
            if haystack.contains(*needle) {
                return true;
            }
        }
        false
    }
}

impl Display for RuntimeError {
Expand Down
273 changes: 146 additions & 127 deletions rust/crates/rusty-claude-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ use runtime::{
ConfigSource, ContentBlock, ConversationMessage, ConversationRuntime, McpServer,
McpServerManager, McpServerSpec, McpTool, MessageRole, ModelPricing, PermissionMode,
PermissionPolicy, ProjectContext, PromptCacheEvent, ResolvedPermissionMode, RuntimeError,
Session, TokenUsage, ToolError, ToolExecutor, UsageTracker,
Session, TokenUsage, ToolError, ToolExecutor, TurnSummary, UsageTracker,
};
use serde::Deserialize;
use serde_json::{json, Map, Value};
Expand Down Expand Up @@ -5684,6 +5684,117 @@ impl LiveCli {
Ok(())
}

/// Upper bound on the number of compact-and-retry rounds that
/// `auto_compact_retry` performs after a context-window error. It also fixes
/// the length of that method's preserve schedule, so the two must be changed
/// together. Once the rounds are used up the failure is returned to the
/// caller.
const MAX_COMPACT_RETRIES: usize = 3;

/// Recovers from a context-window failure by compacting the session and
/// re-running the same user input.
///
/// Up to `MAX_COMPACT_RETRIES` rounds are attempted. Each round keeps fewer
/// recent messages (4, then 2, then 0), trading conversation continuity for
/// a smaller payload until the request fits.
///
/// `runtime` is the runtime whose turn just failed. When a retry is still too
/// large, `runtime` is overwritten with that round's rebuilt runtime so the
/// next round compacts starting from its session.
///
/// # Errors
///
/// * `error` itself, when it is not a context-window failure or when
///   compaction stops early because nothing more can be removed.
/// * The retry's own error, when a retry fails for any other reason or still
///   overflows on the final round.
/// * A `RuntimeError` wrapping the message of a failed
///   `prepare_turn_runtime` or `replace_runtime` call.
fn auto_compact_retry(
    &mut self,
    runtime: &mut BuiltRuntime,
    input: &str,
    error: RuntimeError,
) -> Result<TurnSummary, RuntimeError> {
    if !error.is_context_window_failure() {
        return Err(error);
    }

    // Progressive compaction: each round preserves fewer recent messages
    // (4 → 2 → 0). The array length is tied to MAX_COMPACT_RETRIES, so the
    // compiler rejects a schedule that does not match the round count.
    let preserve_schedule: [usize; Self::MAX_COMPACT_RETRIES] = [4, 2, 0];

    for (round, preserve) in preserve_schedule.iter().copied().enumerate() {
        let attempt = round + 1;
        println!(
            "  Context limit reached, auto-compacting session... (attempt {}/{})",
            attempt,
            Self::MAX_COMPACT_RETRIES
        );

        // Run Trident pipeline then summary-based compaction.
        let result = runtime::trident::trident_compact_session(
            runtime.session(),
            CompactionConfig {
                preserve_recent_messages: preserve,
                max_estimated_tokens: 0,
            },
            &runtime::trident::TridentConfig::default(),
        );
        let removed = result.removed_message_count;

        if removed == 0 && round > 0 {
            // No more messages to compact — further rounds won't help.
            println!("  No further compaction possible.");
            break;
        }

        if removed > 0 {
            println!(
                "{}",
                format_compact_report(removed, result.compacted_session.messages.len(), false)
            );
        }

        // Without this, prepare_turn_runtime() reads from
        // self.runtime.session(), which still holds the ORIGINAL
        // un-compacted session, so every retry round would send the same
        // bloated request and the compaction would be wasted.
        *self.runtime.session_mut() = result.compacted_session.clone();

        // Build a new runtime with the compacted session and retry.
        let (mut new_runtime, hook_abort_monitor) = self
            .prepare_turn_runtime(true)
            .map_err(|e| RuntimeError::new(e.to_string()))?;
        drop(hook_abort_monitor);

        let mut prompter = CliPermissionPrompter::new(self.permission_mode);
        match new_runtime.run_turn(input, Some(&mut prompter)) {
            Ok(summary) => {
                // Retry succeeded — swap in the compacted runtime.
                // NOTE(review): `runtime` still refers to the earlier runtime
                // at this point; a caller that follows up with
                // `replace_runtime(runtime)` would appear to overwrite the
                // one installed here — confirm against the call sites.
                self.replace_runtime(new_runtime)
                    .map_err(|e| RuntimeError::new(e.to_string()))?;
                return Ok(summary);
            }
            Err(retry_error) => {
                let can_retry = retry_error.is_context_window_failure()
                    && attempt < Self::MAX_COMPACT_RETRIES;
                if !can_retry {
                    // Not a context window error, or out of rounds.
                    return Err(retry_error);
                }

                // The compacted session was still too large. Shut down the
                // old runtime (best effort: a shutdown failure must not mask
                // the retry), adopt the partially compacted one, and loop —
                // the next round will compact more aggressively.
                let _ = runtime.shutdown_plugins();
                *runtime = new_runtime;
            }
        }
    }

    // Only reached when compaction could not remove anything further —
    // propagate the original error.
    Err(error)
}

fn run_turn(&mut self, input: &str) -> Result<(), Box<dyn std::error::Error>> {
let (mut runtime, hook_abort_monitor) = self.prepare_turn_runtime(true)?;
let mut spinner = Spinner::new();
Expand Down Expand Up @@ -5726,135 +5837,25 @@ impl LiveCli {
&mut stdout,
)?;

// ============================================================================
// Auto-compact retry on context window errors
// ============================================================================
// When the model API returns a context_window_blocked error (because the request
// exceeds the model's context window), we automatically:
// 1. Compact the session (remove old messages to free up space)
// 2. Retry the original request with the compacted session
// 3. Report results to the user
//
// This eliminates the need for users to manually run /compact when they
// hit context limits - the recovery happens automatically.
//
// Detection: We look for "context_window" or "Context window" in the error
// message, which covers error types like:
// - "context_window_blocked"
// - "Context window blocked"
// - "This model's maximum context length is X tokens..."
// ============================================================================

let error_str = error.to_string();
// Detect context window overflow. Some providers (e.g. OpenAI-compat backends)
// return 400 with "no parseable body" instead of a proper context_length_exceeded
// error when the request is too large to even parse — treat that as context overflow too.
let is_context_window = error_str.contains("context_window")
|| error_str.contains("Context window")
|| error_str.contains("no parseable body");

if is_context_window {
// A single compaction pass may not free enough context space.
// Progressive retry: each round preserves fewer recent messages (4→2→1→0),
// trading conversation continuity for a smaller payload until it fits.
// Max 4 rounds before giving up and surfacing the error to the user.
let max_compact_rounds = 4;
let preserve_schedule = [4, 2, 1, 0];

for round in 0..max_compact_rounds {
let preserve = preserve_schedule[round];
println!(
" Auto-compacting session (round {}/{}, preserving {} recent messages)...",
round + 1,
max_compact_rounds,
preserve
);

// Run Trident pipeline then summary-based compaction
let result = runtime::trident::trident_compact_session(
runtime.session(),
CompactionConfig {
preserve_recent_messages: preserve,
max_estimated_tokens: 0,
},
&runtime::trident::TridentConfig::default(),
);
let removed = result.removed_message_count;

if removed == 0 && round > 0 {
// No more messages to compact — further rounds won't help
println!(" No further compaction possible.");
break;
}

if removed > 0 {
match self.auto_compact_retry(&mut runtime, input, error) {
Ok(summary) => {
spinner.finish(
"✨ Done (after auto-compact)",
TerminalRenderer::new().color_theme(),
&mut stdout,
)?;
println!();
if let Some(event) = summary.auto_compaction {
println!(
"{}",
format_compact_report(
removed,
result.compacted_session.messages.len(),
false
)
format_auto_compaction_notice(event.removed_message_count)
);
}

// Without this, prepare_turn_runtime() reads from self.runtime.session()
// which still holds the ORIGINAL un-compacted session, so every retry round
// would send the same bloated request — compaction was wasted.
*self.runtime.session_mut() = result.compacted_session.clone();

// Build a new runtime with the compacted session and retry
let (mut new_runtime, hook_abort_monitor) =
self.prepare_turn_runtime(true)?;
drop(hook_abort_monitor);

let mut rp = CliPermissionPrompter::new(self.permission_mode);
match new_runtime.run_turn(input, Some(&mut rp)) {
Ok(summary) => {
self.replace_runtime(new_runtime)?;
spinner.finish(
if round == 0 {
"✨ Done (after auto-compact)"
} else {
"✨ Done (after aggressive auto-compact)"
},
TerminalRenderer::new().color_theme(),
&mut stdout,
)?;
println!();
if let Some(event) = summary.auto_compaction {
println!(
"{}",
format_auto_compaction_notice(event.removed_message_count)
);
}
self.persist_session()?;
return Ok(());
}
Err(retry_error) => {
let retry_str = retry_error.to_string();
let still_context_window = retry_str.contains("context_window")
|| retry_str.contains("Context window")
|| retry_str.contains("no parseable body");

if still_context_window && round + 1 < max_compact_rounds {
// The compacted session was still too large for the model's context.
// Shut down the old runtime, adopt the partially-compacted one,
// and loop — the next round will compact more aggressively.
runtime.shutdown_plugins()?;
runtime = new_runtime;
continue;
}

// Not a context window error, or out of rounds
return Err(Box::new(retry_error));
}
}
self.persist_session()?;
Ok(())
}
Err(final_error) => Err(Box::new(final_error)),
}

// If not a context window error, return original error
Err(Box::new(error))
}
}
}
Expand All @@ -5878,7 +5879,13 @@ impl LiveCli {
let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode);
let result = runtime.run_turn(input, Some(&mut permission_prompter));
hook_abort_monitor.stop();
let summary = result?;
let summary = match result {
Ok(s) => s,
Err(error) => {
let _ = runtime.shutdown_plugins();
self.auto_compact_retry(&mut runtime, input, error)?
}
};
self.replace_runtime(runtime)?;
self.persist_session()?;
let final_text = final_assistant_text(&summary);
Expand All @@ -5891,7 +5898,13 @@ impl LiveCli {
let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode);
let result = runtime.run_turn(input, Some(&mut permission_prompter));
hook_abort_monitor.stop();
let summary = result?;
let summary = match result {
Ok(s) => s,
Err(error) => {
let _ = runtime.shutdown_plugins();
self.auto_compact_retry(&mut runtime, input, error)?
}
};
self.replace_runtime(runtime)?;
self.persist_session()?;
println!(
Expand All @@ -5916,7 +5929,13 @@ impl LiveCli {
let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode);
let result = runtime.run_turn(input, Some(&mut permission_prompter));
hook_abort_monitor.stop();
let summary = result?;
let summary = match result {
Ok(s) => s,
Err(error) => {
let _ = runtime.shutdown_plugins();
self.auto_compact_retry(&mut runtime, input, error)?
}
};
self.replace_runtime(runtime)?;
self.persist_session()?;
println!(
Expand Down
Loading