From 96771a83427bcbd1fa87bbc010a031b51cb16604 Mon Sep 17 00:00:00 2001
From: arin-deloatch <arindeloatch@gmail.com>
Date: Thu, 25 Jun 2026 10:57:02 -0700
Subject: [PATCH 1/4] LCORE-1613: Add max_infer_iters and max_tool_calls to
 InferenceConfiguration

---
 src/models/config.py                          | 20 ++++++
 .../config/test_inference_configuration.py    | 68 +++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/src/models/config.py b/src/models/config.py
index fdd6112f4..5462a4680 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -1727,6 +1727,26 @@ class InferenceConfiguration(ConfigurationBase):
         "meaning and are independent of this list.",
     )
 
+    max_infer_iters: Optional[PositiveInt] = Field(
+        default=10,
+        title="Default max inference iterations",
+        description="Server-side default for the maximum number of inference "
+        "iterations a model can perform in a single request. Prevents small "
+        "models from looping indefinitely on tool calls. "
+        "Per-request values take precedence over this default. "
+        "Set to None to disable the limit.",
+    )
+
+    max_tool_calls: Optional[PositiveInt] = Field(
+        default=30,
+        title="Default max tool calls",
+        description="Server-side default for the maximum number of tool calls "
+        "allowed in a single response. Prevents small models from exhausting "
+        "the context window with repeated tool calls. "
+        "Per-request values take precedence over this default. "
+        "Set to None to disable the limit.",
+    )
+
     @model_validator(mode="after")
     def check_default_model_and_provider(self) -> Self:
         """
diff --git a/tests/unit/models/config/test_inference_configuration.py b/tests/unit/models/config/test_inference_configuration.py
index 8465e2f8b..a9cfc4cc5 100644
--- a/tests/unit/models/config/test_inference_configuration.py
+++ b/tests/unit/models/config/test_inference_configuration.py
@@ -90,3 +90,71 @@ def test_context_windows_rejects_negative_size() -> None:
         InferenceConfiguration(
             context_windows={"openai/gpt-4o-mini": -1},
         )  # pyright: ignore[reportCallIssue]
+
+
+def test_max_infer_iters_default() -> None:
+    """Test that max_infer_iters defaults to 10."""
+    config = InferenceConfiguration()  # pyright: ignore[reportCallIssue]
+    assert config.max_infer_iters == 10
+
+
+def test_max_tool_calls_default() -> None:
+    """Test that max_tool_calls defaults to 30."""
+    config = InferenceConfiguration()  # pyright: ignore[reportCallIssue]
+    assert config.max_tool_calls == 30
+
+
+def test_max_infer_iters_accepts_positive_int() -> None:
+    """Test that max_infer_iters accepts a positive integer."""
+    config = InferenceConfiguration(
+        max_infer_iters=5
+    )  # pyright: ignore[reportCallIssue]
+    assert config.max_infer_iters == 5
+
+
+def test_max_tool_calls_accepts_positive_int() -> None:
+    """Test that max_tool_calls accepts a positive integer."""
+    config = InferenceConfiguration(
+        max_tool_calls=20
+    )  # pyright: ignore[reportCallIssue]
+    assert config.max_tool_calls == 20
+
+
+def test_max_infer_iters_rejects_zero() -> None:
+    """Test that max_infer_iters rejects zero."""
+    with pytest.raises(ValueError):
+        InferenceConfiguration(max_infer_iters=0)  # pyright: ignore[reportCallIssue]
+
+
+def test_max_infer_iters_rejects_negative() -> None:
+    """Test that max_infer_iters rejects a negative value."""
+    with pytest.raises(ValueError):
+        InferenceConfiguration(max_infer_iters=-1)  # pyright: ignore[reportCallIssue]
+
+
+def test_max_tool_calls_rejects_zero() -> None:
+    """Test that max_tool_calls rejects zero."""
+    with pytest.raises(ValueError):
+        InferenceConfiguration(max_tool_calls=0)  # pyright: ignore[reportCallIssue]
+
+
+def test_max_tool_calls_rejects_negative() -> None:
+    """Test that max_tool_calls rejects a negative value."""
+    with pytest.raises(ValueError):
+        InferenceConfiguration(max_tool_calls=-1)  # pyright: ignore[reportCallIssue]
+
+
+def test_max_infer_iters_accepts_none() -> None:
+    """Test that max_infer_iters accepts None to disable the limit."""
+    config = InferenceConfiguration(
+        max_infer_iters=None
+    )  # pyright: ignore[reportCallIssue]
+    assert config.max_infer_iters is None
+
+
+def test_max_tool_calls_accepts_none() -> None:
+    """Test that max_tool_calls accepts None to disable the limit."""
+    config = InferenceConfiguration(
+        max_tool_calls=None
+    )  # pyright: ignore[reportCallIssue]
+    assert config.max_tool_calls is None

From 2842b6b340c649aae69607eaa3e0b7817e94624b Mon Sep 17 00:00:00 2001
From: arin-deloatch <arindeloatch@gmail.com>
Date: Thu, 25 Jun 2026 10:57:45 -0700
Subject: [PATCH 2/4] LCORE-1613: Apply inference defaults in query and
 responses endpoints

---
 src/app/endpoints/responses.py | 5 +++++
 src/utils/responses.py         | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py
index 6d0cd9b05..286c95d74 100644
--- a/src/app/endpoints/responses.py
+++ b/src/app/endpoints/responses.py
@@ -466,6 +466,11 @@ async def responses_endpoint_handler(
 
     api_params = ResponsesApiParams.model_validate(updated_request.model_dump())
 
+    if "max_infer_iters" not in original_request.model_fields_set:
+        api_params.max_infer_iters = configuration.inference.max_infer_iters
+    if "max_tool_calls" not in original_request.model_fields_set:
+        api_params.max_tool_calls = configuration.inference.max_tool_calls
+
     # Compact the conversation if it is approaching the context window limit.
     # /v1/responses is OpenAI-compatible, so compaction is silent (no custom SSE
     # event): summarization happens before the response is created, and the turn
diff --git a/src/utils/responses.py b/src/utils/responses.py
index 3141742d8..5e5916e0b 100644
--- a/src/utils/responses.py
+++ b/src/utils/responses.py
@@ -429,6 +429,8 @@ async def prepare_responses_params(  # pylint: disable=too-many-arguments,too-ma
         stream=stream,
         store=store,
         extra_headers=extra_headers,
+        max_infer_iters=configuration.inference.max_infer_iters,
+        max_tool_calls=configuration.inference.max_tool_calls,
     )
 
 

From 59c9c0a0d7896a30c019a05c092383da16b23fda Mon Sep 17 00:00:00 2001
From: arin-deloatch <arindeloatch@gmail.com>
Date: Thu, 25 Jun 2026 10:58:48 -0700
Subject: [PATCH 3/4] LCORE-1613: Fix test mocks broken by inference defaults

---
 tests/unit/utils/test_responses.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py
index 530502869..8863ddd7c 100644
--- a/tests/unit/utils/test_responses.py
+++ b/tests/unit/utils/test_responses.py
@@ -56,7 +56,12 @@
 import constants
 from models.api.requests import QueryRequest
 from models.common.responses.types import InputTool, InputToolMCP
-from models.config import ApprovalFilter, ByokRag, ModelContextProtocolServer
+from models.config import (
+    ApprovalFilter,
+    ByokRag,
+    InferenceConfiguration,
+    ModelContextProtocolServer,
+)
 from utils.query import normalize_vertex_ai_model_id
 from utils.responses import (
     _build_chunk_attributes,
@@ -1976,7 +1981,7 @@ async def test_prepare_responses_params_with_conversation_id(
         )  # pyright: ignore[reportCallIssue]
 
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch("utils.responses.get_system_prompt", return_value="System prompt")
         mocker.patch("utils.responses.prepare_tools", return_value=None)
@@ -2012,7 +2017,7 @@ async def test_prepare_responses_params_create_conversation(
         query_request = QueryRequest(query="test")  # pyright: ignore[reportCallIssue]
 
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch("utils.responses.get_system_prompt", return_value="System prompt")
         mocker.patch("utils.responses.prepare_tools", return_value=None)
@@ -2038,7 +2043,7 @@ async def test_prepare_responses_params_connection_error_on_models(
 
         query_request = QueryRequest(query="test")  # pyright: ignore[reportCallIssue]
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
 
         with pytest.raises(HTTPException) as exc_info:
@@ -2064,7 +2069,7 @@ async def test_prepare_responses_params_connection_error_on_conversation(
         query_request = QueryRequest(query="test")  # pyright: ignore[reportCallIssue]
 
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch("utils.responses.get_system_prompt", return_value="System prompt")
         mocker.patch("utils.responses.prepare_tools", return_value=None)
@@ -2088,7 +2093,7 @@ async def test_prepare_responses_params_api_status_error_on_models(
 
         query_request = QueryRequest(query="test")  # pyright: ignore[reportCallIssue]
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
 
         with pytest.raises(HTTPException) as exc_info:
@@ -2131,7 +2136,7 @@ async def test_prepare_responses_params_includes_mcp_provider_data_headers(
         ]
 
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch("utils.responses.get_system_prompt", return_value="System prompt")
         mocker.patch(
@@ -2179,7 +2184,7 @@ async def test_prepare_responses_params_no_extra_headers_without_mcp_tools(
         query_request = QueryRequest(query="test")  # pyright: ignore[reportCallIssue]
 
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch("utils.responses.get_system_prompt", return_value="System prompt")
         mocker.patch("utils.responses.prepare_tools", return_value=None)
@@ -2211,7 +2216,7 @@ async def test_prepare_responses_params_api_status_error_on_conversation(
         query_request = QueryRequest(query="test")  # pyright: ignore[reportCallIssue]
 
         mock_config = mocker.Mock()
-        mock_config.inference = None
+        mock_config.inference = InferenceConfiguration()
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch("utils.responses.get_system_prompt", return_value="System prompt")
         mocker.patch("utils.responses.prepare_tools", return_value=None)

From de9d863d5f59cd566104f6911ba24883aa0d1ed7 Mon Sep 17 00:00:00 2001
From: arin-deloatch <arindeloatch@gmail.com>
Date: Fri, 26 Jun 2026 10:55:28 -0700
Subject: [PATCH 4/4] LCORE-1613: Update OpenAPI spec; apply inference
 defaults before building ResponsesApiParams

---
 docs/openapi.json              | 28 ++++++++++++++++++++++++++++
 src/app/endpoints/responses.py |  8 ++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/docs/openapi.json b/docs/openapi.json
index 7f9a26513..f04cd0382 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -13614,6 +13614,34 @@
                         "type": "array",
                         "title": "High-level inference providers",
                         "description": "Unified-mode synthesis input (Decision S5): a high-level, backend-agnostic list of inference providers the synthesizer expands into Llama Stack provider entries. Lives at the configuration root so it survives a future backend change. A non-empty list signals unified mode. Empty (the default) leaves legacy/remote modes unaffected. The sibling default_model / default_provider keep their query-time routing meaning and are independent of this list."
+                    },
+                    "max_infer_iters": {
+                        "anyOf": [
+                            {
+                                "type": "integer",
+                                "exclusiveMinimum": 0.0
+                            },
+                            {
+                                "type": "null"
+                            }
+                        ],
+                        "title": "Default max inference iterations",
+                        "description": "Server-side default for the maximum number of inference iterations a model can perform in a single request. Prevents small models from looping indefinitely on tool calls. Per-request values take precedence over this default. Set to None to disable the limit.",
+                        "default": 10
+                    },
+                    "max_tool_calls": {
+                        "anyOf": [
+                            {
+                                "type": "integer",
+                                "exclusiveMinimum": 0.0
+                            },
+                            {
+                                "type": "null"
+                            }
+                        ],
+                        "title": "Default max tool calls",
+                        "description": "Server-side default for the maximum number of tool calls allowed in a single response. Prevents small models from exhausting the context window with repeated tool calls. Per-request values take precedence over this default. Set to None to disable the limit.",
+                        "default": 30
                     }
                 },
                 "additionalProperties": false,
diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py
index 286c95d74..6d77a5799 100644
--- a/src/app/endpoints/responses.py
+++ b/src/app/endpoints/responses.py
@@ -464,12 +464,12 @@ async def responses_endpoint_handler(
             original_request.input, inline_rag_context.context_text
         )
 
-    api_params = ResponsesApiParams.model_validate(updated_request.model_dump())
-
     if "max_infer_iters" not in original_request.model_fields_set:
-        api_params.max_infer_iters = configuration.inference.max_infer_iters
+        updated_request.max_infer_iters = configuration.inference.max_infer_iters
     if "max_tool_calls" not in original_request.model_fields_set:
-        api_params.max_tool_calls = configuration.inference.max_tool_calls
+        updated_request.max_tool_calls = configuration.inference.max_tool_calls
+
+    api_params = ResponsesApiParams.model_validate(updated_request.model_dump())
 
     # Compact the conversation if it is approaching the context window limit.
     # /v1/responses is OpenAI-compatible, so compaction is silent (no custom SSE