From 96771a83427bcbd1fa87bbc010a031b51cb16604 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Thu, 25 Jun 2026 10:57:02 -0700 Subject: [PATCH 1/4] LCORE-1613: Add max_infer_iters and max_tool_calls to InferenceConfiguration --- src/models/config.py | 20 ++++++ .../config/test_inference_configuration.py | 68 +++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/src/models/config.py b/src/models/config.py index fdd6112f4..5462a4680 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -1727,6 +1727,26 @@ class InferenceConfiguration(ConfigurationBase): "meaning and are independent of this list.", ) + max_infer_iters: Optional[PositiveInt] = Field( + default=10, + title="Default max inference iterations", + description="Server-side default for the maximum number of inference " + "iterations a model can perform in a single request. Prevents small " + "models from looping indefinitely on tool calls. " + "Per-request values take precedence over this default. " + "Set to None to disable the limit.", + ) + + max_tool_calls: Optional[PositiveInt] = Field( + default=30, + title="Default max tool calls", + description="Server-side default for the maximum number of tool calls " + "allowed in a single response. Prevents small models from exhausting " + "the context window with repeated tool calls. " + "Per-request values take precedence over this default. " + "Set to None to disable the limit.", + ) + @model_validator(mode="after") def check_default_model_and_provider(self) -> Self: """ diff --git a/tests/unit/models/config/test_inference_configuration.py b/tests/unit/models/config/test_inference_configuration.py index 8465e2f8b..a9cfc4cc5 100644 --- a/tests/unit/models/config/test_inference_configuration.py +++ b/tests/unit/models/config/test_inference_configuration.py @@ -90,3 +90,71 @@ def test_context_windows_rejects_negative_size() -> None: InferenceConfiguration( context_windows={"openai/gpt-4o-mini": -1}, ) # pyright: ignore[reportCallIssue] + + +def test_max_infer_iters_default() -> None: + """Test that max_infer_iters defaults to 10.""" + config = InferenceConfiguration() # pyright: ignore[reportCallIssue] + assert config.max_infer_iters == 10 + + +def test_max_tool_calls_default() -> None: + """Test that max_tool_calls defaults to 30.""" + config = InferenceConfiguration() # pyright: ignore[reportCallIssue] + assert config.max_tool_calls == 30 + + +def test_max_infer_iters_accepts_positive_int() -> None: + """Test that max_infer_iters accepts a positive integer.""" + config = InferenceConfiguration( + max_infer_iters=5 + ) # pyright: ignore[reportCallIssue] + assert config.max_infer_iters == 5 + + +def test_max_tool_calls_accepts_positive_int() -> None: + """Test that max_tool_calls accepts a positive integer.""" + config = InferenceConfiguration( + max_tool_calls=20 + ) # pyright: ignore[reportCallIssue] + assert config.max_tool_calls == 20 + + +def test_max_infer_iters_rejects_zero() -> None: + """Test that max_infer_iters rejects zero.""" + with pytest.raises(ValueError): + InferenceConfiguration(max_infer_iters=0) # pyright: ignore[reportCallIssue] + + +def test_max_infer_iters_rejects_negative() -> None: + """Test that max_infer_iters rejects a negative value.""" + with pytest.raises(ValueError): + InferenceConfiguration(max_infer_iters=-1) # pyright: ignore[reportCallIssue] + + +def test_max_tool_calls_rejects_zero() -> None: + """Test that max_tool_calls rejects zero.""" + with pytest.raises(ValueError): + InferenceConfiguration(max_tool_calls=0) # pyright: ignore[reportCallIssue] + + +def test_max_tool_calls_rejects_negative() -> None: + """Test that max_tool_calls rejects a negative value.""" + with pytest.raises(ValueError): + InferenceConfiguration(max_tool_calls=-1) # pyright: ignore[reportCallIssue] + + +def test_max_infer_iters_accepts_none() -> None: + """Test that max_infer_iters accepts None to disable the limit.""" + config = InferenceConfiguration( + max_infer_iters=None + ) # pyright: ignore[reportCallIssue] + assert config.max_infer_iters is None + + +def test_max_tool_calls_accepts_none() -> None: + """Test that max_tool_calls accepts None to disable the limit.""" + config = InferenceConfiguration( + max_tool_calls=None + ) # pyright: ignore[reportCallIssue] + assert config.max_tool_calls is None From 2842b6b340c649aae69607eaa3e0b7817e94624b Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Thu, 25 Jun 2026 10:57:45 -0700 Subject: [PATCH 2/4] LCORE-1613: Apply inference defaults in query and responses endpoints --- src/app/endpoints/responses.py | 5 +++++ src/utils/responses.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py index 6d0cd9b05..286c95d74 100644 --- a/src/app/endpoints/responses.py +++ b/src/app/endpoints/responses.py @@ -466,6 +466,11 @@ async def responses_endpoint_handler( api_params = ResponsesApiParams.model_validate(updated_request.model_dump()) + if "max_infer_iters" not in original_request.model_fields_set: + api_params.max_infer_iters = configuration.inference.max_infer_iters + if "max_tool_calls" not in original_request.model_fields_set: + api_params.max_tool_calls = configuration.inference.max_tool_calls + # Compact the conversation if it is approaching the context window limit. # /v1/responses is OpenAI-compatible, so compaction is silent (no custom SSE # event): summarization happens before the response is created, and the turn diff --git a/src/utils/responses.py b/src/utils/responses.py index 3141742d8..5e5916e0b 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -429,6 +429,8 @@ async def prepare_responses_params( # pylint: disable=too-many-arguments,too-ma stream=stream, store=store, extra_headers=extra_headers, + max_infer_iters=configuration.inference.max_infer_iters, + max_tool_calls=configuration.inference.max_tool_calls, ) From 59c9c0a0d7896a30c019a05c092383da16b23fda Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Thu, 25 Jun 2026 10:58:48 -0700 Subject: [PATCH 3/4] LCORE-1613: Fix test mocks broken by inference defaults --- tests/unit/utils/test_responses.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index 530502869..8863ddd7c 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -56,7 +56,12 @@ import constants from models.api.requests import QueryRequest from models.common.responses.types import InputTool, InputToolMCP -from models.config import ApprovalFilter, ByokRag, ModelContextProtocolServer +from models.config import ( + ApprovalFilter, + ByokRag, + InferenceConfiguration, + ModelContextProtocolServer, +) from utils.query import normalize_vertex_ai_model_id from utils.responses import ( _build_chunk_attributes, @@ -1976,7 +1981,7 @@ async def test_prepare_responses_params_with_conversation_id( ) # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) mocker.patch("utils.responses.get_system_prompt", return_value="System prompt") mocker.patch("utils.responses.prepare_tools", return_value=None) @@ -2012,7 +2017,7 @@ async def test_prepare_responses_params_create_conversation( query_request = QueryRequest(query="test") # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) mocker.patch("utils.responses.get_system_prompt", return_value="System prompt") mocker.patch("utils.responses.prepare_tools", return_value=None) @@ -2038,7 +2043,7 @@ async def test_prepare_responses_params_connection_error_on_models( query_request = QueryRequest(query="test") # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) with pytest.raises(HTTPException) as exc_info: @@ -2064,7 +2069,7 @@ async def test_prepare_responses_params_connection_error_on_conversation( query_request = QueryRequest(query="test") # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) mocker.patch("utils.responses.get_system_prompt", return_value="System prompt") mocker.patch("utils.responses.prepare_tools", return_value=None) @@ -2088,7 +2093,7 @@ async def test_prepare_responses_params_api_status_error_on_models( query_request = QueryRequest(query="test") # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) with pytest.raises(HTTPException) as exc_info: @@ -2131,7 +2136,7 @@ async def test_prepare_responses_params_includes_mcp_provider_data_headers( ] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) mocker.patch("utils.responses.get_system_prompt", return_value="System prompt") mocker.patch( @@ -2179,7 +2184,7 @@ async def test_prepare_responses_params_no_extra_headers_without_mcp_tools( query_request = QueryRequest(query="test") # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) mocker.patch("utils.responses.get_system_prompt", return_value="System prompt") mocker.patch("utils.responses.prepare_tools", return_value=None) @@ -2211,7 +2216,7 @@ async def test_prepare_responses_params_api_status_error_on_conversation( query_request = QueryRequest(query="test") # pyright: ignore[reportCallIssue] mock_config = mocker.Mock() - mock_config.inference = None + mock_config.inference = InferenceConfiguration() mocker.patch("utils.responses.configuration", mock_config) mocker.patch("utils.responses.get_system_prompt", return_value="System prompt") mocker.patch("utils.responses.prepare_tools", return_value=None) From de9d863d5f59cd566104f6911ba24883aa0d1ed7 Mon Sep 17 00:00:00 2001 From: arin-deloatch Date: Fri, 26 Jun 2026 10:55:28 -0700 Subject: [PATCH 4/4] LCORE-1613: generate OpenAPI spec; address code pattern nit --- docs/openapi.json | 28 ++++++++++++++++++++++++++++ src/app/endpoints/responses.py | 8 ++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/docs/openapi.json b/docs/openapi.json index 7f9a26513..f04cd0382 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -13614,6 +13614,34 @@ "type": "array", "title": "High-level inference providers", "description": "Unified-mode synthesis input (Decision S5): a high-level, backend-agnostic list of inference providers the synthesizer expands into Llama Stack provider entries. Lives at the configuration root so it survives a future backend change. A non-empty list signals unified mode. Empty (the default) leaves legacy/remote modes unaffected. The sibling default_model / default_provider keep their query-time routing meaning and are independent of this list." + }, + "max_infer_iters": { + "anyOf": [ + { + "type": "integer", + "exclusiveMinimum": 0.0 + }, + { + "type": "null" + } + ], + "title": "Default max inference iterations", + "description": "Server-side default for the maximum number of inference iterations a model can perform in a single request. Prevents small models from looping indefinitely on tool calls. Per-request values take precedence over this default. Set to None to disable the limit.", + "default": 10 + }, + "max_tool_calls": { + "anyOf": [ + { + "type": "integer", + "exclusiveMinimum": 0.0 + }, + { + "type": "null" + } + ], + "title": "Default max tool calls", + "description": "Server-side default for the maximum number of tool calls allowed in a single response. Prevents small models from exhausting the context window with repeated tool calls. Per-request values take precedence over this default. Set to None to disable the limit.", + "default": 30 } }, "additionalProperties": false, diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py index 286c95d74..6d77a5799 100644 --- a/src/app/endpoints/responses.py +++ b/src/app/endpoints/responses.py @@ -464,12 +464,12 @@ async def responses_endpoint_handler( original_request.input, inline_rag_context.context_text ) - api_params = ResponsesApiParams.model_validate(updated_request.model_dump()) - if "max_infer_iters" not in original_request.model_fields_set: - api_params.max_infer_iters = configuration.inference.max_infer_iters + updated_request.max_infer_iters = configuration.inference.max_infer_iters if "max_tool_calls" not in original_request.model_fields_set: - api_params.max_tool_calls = configuration.inference.max_tool_calls + updated_request.max_tool_calls = configuration.inference.max_tool_calls + + api_params = ResponsesApiParams.model_validate(updated_request.model_dump()) # Compact the conversation if it is approaching the context window limit. # /v1/responses is OpenAI-compatible, so compaction is silent (no custom SSE