From 52f942740f552bdee0b28bd8b4a927ff067381c6 Mon Sep 17 00:00:00 2001 From: Shulin Gao Date: Mon, 29 Jun 2026 15:37:27 +0200 Subject: [PATCH] fix(kosong): clamp max_tokens to max_output_size for OpenAI-compatible providers Third-party OpenAI-compatible providers (HuggingFace, Ollama, etc.) can have output limits below the generic CHAT_COMPLETIONS_MAX_OUTPUT_TOKENS_CEILING (131072). When max_output_size is explicitly configured, withMaxCompletionTokens now honours it as a hard upper bound instead of applying the generic ceiling, preventing 400 errors from providers whose actual limit is lower. Fixes #1148. --- .changeset/fix-openai-max-output-size.md | 7 +++++ .../src/session/provider-manager.ts | 1 + .../test/agent/config-state.test.ts | 6 ++--- .../kosong/src/providers/openai-legacy.ts | 16 +++++++++++- packages/kosong/test/openai-legacy.test.ts | 26 +++++++++++++++++++ 5 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 .changeset/fix-openai-max-output-size.md diff --git a/.changeset/fix-openai-max-output-size.md b/.changeset/fix-openai-max-output-size.md new file mode 100644 index 000000000..639c0cfc4 --- /dev/null +++ b/.changeset/fix-openai-max-output-size.md @@ -0,0 +1,7 @@ +--- +"@moonshot-ai/kosong": patch +"@moonshot-ai/agent-core": patch +"@moonshot-ai/kimi-code": patch +--- + +Fix `max_tokens` exceeding provider limit for OpenAI-compatible endpoints. When `max_output_size` is configured, it is now used as a hard ceiling for `max_tokens` instead of being overridden by the generic 128k OpenAI ceiling. This prevents 400 errors from third-party providers (HuggingFace, Ollama, etc.) whose actual output limits are below 131072. 
diff --git a/packages/agent-core/src/session/provider-manager.ts b/packages/agent-core/src/session/provider-manager.ts index 0616f101d..62df8a1eb 100644 --- a/packages/agent-core/src/session/provider-manager.ts +++ b/packages/agent-core/src/session/provider-manager.ts @@ -283,6 +283,7 @@ function toKosongProviderConfig( baseUrl: providerValue(provider.baseUrl, provider.env, 'OPENAI_BASE_URL'), apiKey: providerApiKey(provider), reasoningKey, + ...(maxOutputSize !== undefined ? { maxTokens: maxOutputSize } : {}), ...defaultHeadersField({ ...envCustomHeaders, ...kimiUserAgentHeader(kimiRequestHeaders), diff --git a/packages/agent-core/test/agent/config-state.test.ts b/packages/agent-core/test/agent/config-state.test.ts index f200108a7..ab5492f40 100644 --- a/packages/agent-core/test/agent/config-state.test.ts +++ b/packages/agent-core/test/agent/config-state.test.ts @@ -121,9 +121,9 @@ describe('ConfigState model capabilities', () => { signal: new AbortController().signal, }); - // maxOutputSize (384000) is clamped to the 128k ceiling applied to - // non-Kimi chat-completions providers. - expect(requestMaxTokens).toBe(131072); + // maxOutputSize (384000) is honoured as the hard ceiling for OpenAI-compatible + // providers. The generic 128k ceiling only applies when max_output_size is unset. 
+ expect(requestMaxTokens).toBe(384000); }); it('uses session id as a provider prompt cache hint without storing it on Agent', () => { diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index 6caa55e24..29fb47975 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -453,6 +453,7 @@ export class OpenAILegacyChatProvider implements ChatProvider { private _reasoningKey: string | undefined; private _reasoningEffort: string | undefined; private _generationKwargs: OpenAILegacyGenerationKwargs; + private _explicitMaxTokens: boolean; private _toolMessageConversion: ToolMessageConversion; private _client: OpenAI | undefined; private _httpClient: unknown; @@ -475,6 +476,7 @@ export class OpenAILegacyChatProvider implements ChatProvider { ? normalizedReasoningKey : undefined; this._reasoningEffort = undefined; + this._explicitMaxTokens = options.maxTokens !== undefined; this._generationKwargs = options.maxTokens !== undefined ? completionTokenKwargs(this._model, options.maxTokens) : {}; this._toolMessageConversion = options.toolMessageConversion ?? null; @@ -606,7 +608,19 @@ export class OpenAILegacyChatProvider implements ChatProvider { ) { cap = Math.min(cap, options.maxContextTokens - options.usedContextTokens); } - cap = Math.min(cap, CHAT_COMPLETIONS_MAX_OUTPUT_TOKENS_CEILING); + if (this._explicitMaxTokens) { + // When max_output_size is explicitly configured, honour it as a hard upper + // bound. Third-party OpenAI-compatible providers (HuggingFace, Ollama, etc.) + // can have output limits below CHAT_COMPLETIONS_MAX_OUTPUT_TOKENS_CEILING; + // applying the generic ceiling would override the user's intent and cause a 400. + const configuredCap = + this._generationKwargs.max_tokens ?? 
this._generationKwargs.max_completion_tokens; + if (configuredCap !== undefined) { + cap = Math.min(cap, configuredCap); + } + } else { + cap = Math.min(cap, CHAT_COMPLETIONS_MAX_OUTPUT_TOKENS_CEILING); + } return this.withGenerationKwargs(completionTokenKwargs(this._model, Math.max(1, cap))); } diff --git a/packages/kosong/test/openai-legacy.test.ts b/packages/kosong/test/openai-legacy.test.ts index f02efd048..5b9a872c4 100644 --- a/packages/kosong/test/openai-legacy.test.ts +++ b/packages/kosong/test/openai-legacy.test.ts @@ -655,6 +655,32 @@ describe('OpenAILegacyChatProvider', () => { // 1000000 - 30000 = 970000, clamped to 131072 expect(body['max_tokens']).toBe(131072); }); + + it('withMaxCompletionTokens respects explicit maxTokens as a ceiling for third-party providers', async () => { + // Reproduces issue #1148: a third-party provider (e.g. HuggingFace, Ollama) may + // have an output limit (e.g. 65536) lower than CHAT_COMPLETIONS_MAX_OUTPUT_TOKENS_CEILING + // (131072). When max_output_size is configured, withMaxCompletionTokens must not + // override it with the generic ceiling. + const provider = new OpenAILegacyChatProvider({ + model: 'deepseek-v4-pro', + apiKey: 'test-key', + stream: false, + maxTokens: 65536, + }); + const capped = provider.withMaxCompletionTokens(1_048_576, { + usedContextTokens: 0, + maxContextTokens: 1_048_576, + }); + const history: Message[] = [ + { role: 'user', content: [{ type: 'text', text: 'Hi' }], toolCalls: [] }, + ]; + const body = await captureRequestBody(capped, '', [], history); + + // Expected: 65536 (the explicit maxTokens cap). + // Before this fix the request carried 131072 (CHAT_COMPLETIONS_MAX_OUTPUT_TOKENS_CEILING), + // which exceeds the model's actual API limit and causes a 400. + expect(body['max_tokens']).toBe(65536); + }); }); describe('maxTokens option', () => {