Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/app/endpoints/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ async def query_endpoint_handler(
responses_params,
moderation_result,
endpoint_path,
compaction.original_input if compaction.compacted else None,
compaction.original_input,
)

if moderation_result.decision == "passed":
Expand Down
14 changes: 5 additions & 9 deletions src/app/endpoints/streaming_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
responses_params=responses_params,
context=context,
endpoint_path=endpoint_path,
original_input=None,
)

# Combine inline RAG results (BYOK + Solr) with tool-based results
Expand All @@ -353,6 +354,8 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals
responses_params=responses_params,
turn_summary=turn_summary,
background_topic_summary_tasks=_background_topic_summary_tasks,
emit_start=True,
original_input=None,
),
media_type=response_media_type,
)
Expand Down Expand Up @@ -387,7 +390,6 @@ async def retrieve_response_generator(
if context.moderation_result.decision == "blocked":
turn_summary.llm_response = context.moderation_result.message
turn_summary.id = context.moderation_result.moderation_id
turn_summary.output_items = [context.moderation_result.refusal_response]
# In compacted mode the conversation parameter was omitted, so the
# refusal turn (with the original input) is persisted by
# generate_response; storing it here too would duplicate it.
Expand Down Expand Up @@ -506,6 +508,7 @@ async def generate_response_with_compaction(
responses_params=responses_params,
context=context,
endpoint_path=endpoint_path,
original_input=compacted_original_input,
)
except HTTPException as e:
yield http_exception_stream_event(e)
Expand Down Expand Up @@ -699,7 +702,7 @@ async def generate_response( # pylint: disable=too-many-arguments,too-many-posi
if original_input is not None
else context.query_request.query
),
turn_summary.output_items,
[], # field was removed from TurnSummary
)
except Exception: # pylint: disable=broad-except
logger.exception(
Expand Down Expand Up @@ -873,10 +876,6 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
getattr(chunk, "response"), # noqa: B009
)
turn_summary.llm_response = turn_summary.llm_response or "".join(text_parts)
# Capture structured output items for compacted-mode turn storage
# (LCORE-1572), so the persisted turn keeps non-text output items
# rather than being flattened to the response text.
turn_summary.output_items = list(latest_response_object.output or [])
yield stream_event(
{
"id": chunk_id,
Expand All @@ -893,9 +892,6 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
OpenAIResponseObject,
getattr(chunk, "response"), # noqa: B009
)
# Capture any partial output items so a compacted-mode turn is not
# persisted with empty output on these terminals (LCORE-1572).
turn_summary.output_items = list(latest_response_object.output or [])
error_message = (
latest_response_object.error.message
if latest_response_object.error
Expand Down
6 changes: 0 additions & 6 deletions src/models/common/turn_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from typing import Any, Optional

from llama_stack_api import OpenAIResponseOutput
from pydantic import AnyUrl, BaseModel, Field

from utils.token_counter import TokenCounter
Expand Down Expand Up @@ -109,11 +108,6 @@ class TurnSummary(BaseModel):
rag_chunks: list[RAGChunk] = Field(default_factory=list)
referenced_documents: list[ReferencedDocument] = Field(default_factory=list)
token_usage: TokenCounter = Field(default_factory=TokenCounter)
output_items: list[OpenAIResponseOutput] = Field(
default_factory=list,
description="Structured response output items, captured for compacted-mode "
"turn persistence (LCORE-1572). Empty on the non-compacted path.",
)


class ToolInfoSummary(BaseModel):
Expand Down
11 changes: 9 additions & 2 deletions src/pydantic_ai_lightspeed/llamastack/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""Pydantic AI provider for Llama Stack."""

from pydantic_ai_lightspeed.llamastack._model import LlamaStackResponsesModel
from pydantic_ai_lightspeed.llamastack._model import (
CompactionTurnContext,
LlamaStackResponsesModel,
)
from pydantic_ai_lightspeed.llamastack._provider import LlamaStackProvider

__all__ = ["LlamaStackProvider", "LlamaStackResponsesModel"]
__all__ = [
"CompactionTurnContext",
"LlamaStackProvider",
"LlamaStackResponsesModel",
]
Loading
Loading