diff --git a/.gitignore b/.gitignore index 831ec5d..255f4a7 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ test_lhm/ table_info_db ko_reranker_local *.csv +bot.log +lang2sql_data.db +lang2sql-datasets/ +docs/lang2sql-datasets.zip diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..ee0a2fb --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,63 @@ +# Lang2SQL — Claude Code 작업 가이드 + +## 프로젝트 정체성 + +"더 좋은 SQL 생성기"가 아니라 **현실의 messy DB에서도 답하는 분석 에이전트**. +Vanna/Wren이 못 푸는 4가지 현실 문제(DB 강건성, 기억, 문서 ingestion, 팀별 시멘틱 분기)를 다룬다. + +## 4기둥 현황 + +| ★ | 기둥 | V1 상태 | V1.5 목표 | +|---|---|---|---| +| ① | **Safety pipeline + DB 강건성** | whitelist/timeout layer만 존재. 메타데이터 자동 보강 **미구현** | 자동 보강이 핵심 차별점 | +| ② | Memory 3축 | in-memory store, inject-all recall, manual extractor | SQLite/keyword/auto | +| ③ | Ingestion matrix | file source + LLM extractor | URL/Notion/DDL | +| ④ | Semantic federation | 3-scope merge 동작 | diff/promote 커맨드 | + +## 아키텍처 한 줄 요약 + +``` +frontends → tenancy(조립점) → harness(agent_loop) → 4기둥 → core ports ← adapters +``` + +- `core/ports/` — Protocol 정의만, 외부 의존 0. **건드리지 말 것** +- `adapters/` — 외부 시스템 구체 구현 (DB, LLM, storage) +- `tenancy/concierge.py` — 유일한 조립점 (구체 클래스 import 허용) + +## 현재 작업 포커스: DB 강건성 (★①) 고도화 + +### 문제 +`Column.description`이 비어 있는 실무 DB에서 LLM이 컬럼 의미를 모름 → 잘못된 SQL 생성. + +### 관련 파일 +- [src/lang2sql/core/ports/explorer.py](src/lang2sql/core/ports/explorer.py) — `Column.description` 필드 (v1.5 자동 보강 예정 주석 있음) +- [src/lang2sql/adapters/db/sqlalchemy_explorer.py](src/lang2sql/adapters/db/sqlalchemy_explorer.py) — `_describe_table_sync`: `c.get("comment") or ""`로 DB comment만 읽음 +- [src/lang2sql/tools/explore_schema.py](src/lang2sql/tools/explore_schema.py) — description 있을 때만 프롬프트에 노출 +- [src/lang2sql/harness/system_prompt.py](src/lang2sql/harness/system_prompt.py) — 스키마 주입 위치 +- [src/lang2sql/safety/](src/lang2sql/safety/) — pipeline.py + layers/ (새 layer 추가 시 여기) + +### 확장 패턴 (기존 코드 안 건드리고 추가) +- 새 safety layer → `safety/layers/.py`에 `SafetyLayerPort` 구현 후 `pipeline.py` 목록에 끼우기 +- 새 DB 어댑터 → `adapters/db/_explorer.py`에 `ExplorerPort` 구현 후 `factory.py`에 scheme 분기 +- 메타데이터 보강 → `ExplorerPort`에 `enrich_metadata()` 메서드 추가 또는 별도 enricher 포트로 추상화 + +## 개발 환경 + +```bash +cd /home/sewon/project/Lang2SQL +uv sync +.venv/bin/pytest -q # 110개 테스트 (safety 12개 회귀 포함) +.venv/bin/python bench/ecommerce_demo.py # federation + safety 데모 +``` + +## Git 브랜치 전략 + +- 내 포크: `git@github.com:thrcle/Lang2SQL.git` (origin) +- 업스트림: `https://github.com/CausalInferenceLab/Lang2SQL.git` (upstream) +- 작업 브랜치 생성 후 origin에 push → upstream으로 PR +- 브랜치 가이드: `docs/branch_guidelines.md`, PR 가이드: `docs/pull_request_guidelines.md` + +## 테스트 원칙 + +- safety 회귀 12케이스는 **머지 게이트** — 새 layer 추가 시 반드시 케이스 추가 +- `adapters/llm/fake.py`로 오프라인 LLM 테스트 가능 diff --git a/README.md b/README.md index 09fd9fc..d072219 100644 --- a/README.md +++ b/README.md @@ -106,11 +106,11 @@ The bot exits loudly if `DISCORD_BOT_TOKEN` is unset. Full setup and hosting: ## What V1 does / does NOT do yet (honesty section) **Does:** -- 3-scope semantic federation (guild / channel / thread) with most-specific-wins - resolution; `define_metric` writes to the current scope. +- 3-scope semantic federation (guild / channel / member) with most-specific-wins + resolution; `term_custom` registers definitions per scope (KV-backed). - Safety pipeline with the V1 layers (whitelist + timeout), gating every query. -- Agent loop with six tools: `run_sql`, `explore_schema`, `define_metric`, - `ingest_doc`, `remember`, `ask_user`. +- Agent loop with eight tools: `run_sql`, `explore_schema`, `enrich_schema`, + `term_custom`, `org_setup`, `ingest_doc`, `remember`, `ask_user`. - Memory service (in-memory store + inject-all recall + manual `/remember`). - Discord frontend (bot, commands, session router, render). - Encrypted-at-rest secrets (Fernet) and SQLite-backed persistence. diff --git a/bench/ecommerce_demo.py b/bench/ecommerce_demo.py index b910854..0fab70d 100644 --- a/bench/ecommerce_demo.py +++ b/bench/ecommerce_demo.py @@ -4,9 +4,9 @@ Run: .venv/bin/python bench/ecommerce_demo.py This is the study-group demo. It exercises the *real* V1 code paths -(``ContextConcierge`` + scope resolver + the canned Postgres explorer + the -offline ``FakeLLM``) to show three things end-to-end without a token or a live -database: +(``ContextConcierge`` + KV-backed federation + the canned Postgres explorer + +the offline ``FakeLLM``) to show three things end-to-end without a token or a +live database: Section 1 — define three e-commerce metrics in a channel and read them back. Section 2 — ★④ semantic federation: the *same* term ``active_user`` carries @@ -22,13 +22,13 @@ import asyncio +from lang2sql.adapters.storage.sqlite_store import SqliteStore from lang2sql.core.identity import Identity from lang2sql.core.ports.safety import SafetyContext, Verdict from lang2sql.harness.loop import agent_loop from lang2sql.safety.pipeline import SafetyPipeline -from lang2sql.semantic.types import Metric from lang2sql.tenancy.concierge import ContextConcierge -from lang2sql.tenancy.scope_resolver import ScopeResolver +from lang2sql.tools.semantic_federation import FedEntry, _kv_key, _render_effective, _load_all, _resolve_term # Stable IDs for the demo guild and its two channels. GUILD = "acme-shop" @@ -50,20 +50,13 @@ def _finance_identity() -> Identity: return Identity(user_id="evan", guild_id=GUILD, channel_id=CH_FINANCE) +def _define_term(store: SqliteStore, scope: str, term: str, layer: str, entity: str, definition: str) -> None: + entry = FedEntry(term=term, layer=layer, entity=entity, definition=definition) + store.kv_set(scope, _kv_key(term, layer, entity), entry.to_json()) + + async def section_0_harness(concierge: ContextConcierge) -> None: - """Drive one full agent turn through the assembled harness (offline). - - This is the *wiring* proof, not an intelligence proof: ``ContextConcierge`` - picks the offline FakeLLM (no OPENAI_API_KEY), starts a session, and wires - the canned Postgres explorer + six tools into a ``HarnessContext`` that - ``agent_loop`` drives LLM → tool → LLM to a final answer. No network, no - real database. - - The FakeLLM is a deterministic stub: it blindly calls the first tool - (``run_sql``) with placeholder args, so its turn ends up *demonstrating the - safety gate* rather than answering the question. With OPENAI_API_KEY set, - the same loop calls gpt-4.1-mini instead — zero other code changes. - """ + """Drive one full agent turn through the assembled harness (offline).""" _hr("SECTION 0 — assembled harness runs one turn (ContextConcierge + FakeLLM)") ident = _marketing_identity() @@ -80,64 +73,67 @@ async def section_0_harness(concierge: ContextConcierge) -> None: print(" ★① behaviour Section 3 isolates.") -async def section_1_define_metrics(resolver: ScopeResolver) -> None: +async def section_1_define_metrics(store: SqliteStore) -> None: """Define three e-commerce metrics in #marketing and read them back.""" _hr("SECTION 1 — define three metrics (★① business-context learning)") ident = _marketing_identity() - scope = ident.default_write_scope() # current channel by default - print(f"Writing to default scope for this channel: {scope}\n") + channel_id = ident.channel_id or "" + scope = ident.guild_id or GUILD + print(f"Writing to channel layer for #{CH_MARKETING} (channel_id={channel_id})\n") metrics = [ - Metric("total_revenue", "SUM(orders.amount) WHERE status != 'cancelled'"), - Metric("aov", "total_revenue / COUNT(DISTINCT orders.id)"), - Metric("paid_orders", "COUNT(*) FROM orders WHERE status = 'paid'"), + ("total_revenue", "SUM(orders.amount) WHERE status != 'cancelled'"), + ("aov", "total_revenue / COUNT(DISTINCT orders.id)"), + ("paid_orders", "COUNT(*) FROM orders WHERE status = 'paid'"), ] - for m in metrics: - await resolver.define(scope, m) - print(f" defined {m.name:>14} = {m.definition}") + for name, definition in metrics: + _define_term(store, scope, name, "channel", channel_id, definition) + print(f" defined {name:>14} = {definition}") - layer = await resolver.effective_layer(ident) - print(f"\nEffective layer for #{CH_MARKETING} now holds " - f"{len(layer.entries)} definition(s):") - print(layer.render()) + rendered = _render_effective(store, scope, channel_id, ident.user_id) + lines = [l for l in rendered.splitlines() if l.startswith("-")] + print(f"\nEffective layer for #{CH_MARKETING} now holds {len(lines)} definition(s):") + print(rendered) -async def section_2_federation(resolver: ScopeResolver) -> None: +async def section_2_federation(store: SqliteStore) -> None: """Same term, two channels, two definitions — no conflict (★④).""" _hr("SECTION 2 — semantic federation: one term, two definitions (★④)") - # #marketing defines active_user one way ... mkt = _marketing_identity() - await resolver.define( - mkt.default_write_scope(), - Metric("active_user", "user with a login event in the last 30 days"), - ) - # ... and #finance defines the SAME name a different way. fin = _finance_identity() - await resolver.define( - fin.default_write_scope(), - Metric("active_user", "user with an active paid subscription"), - ) + + _define_term(store, GUILD, "active_user", "channel", CH_MARKETING, + "user with a login event in the last 30 days") + _define_term(store, GUILD, "active_user", "channel", CH_FINANCE, + "user with an active paid subscription") print("Defined 'active_user' independently in two channels.\n") print("Now resolving the *effective* definition each channel sees") print("by walking its scope chain (most specific scope wins):\n") - mkt_layer = await resolver.effective_layer(mkt) - fin_layer = await resolver.effective_layer(fin) - mkt_def = mkt_layer.lookup("active_user") - fin_def = fin_layer.lookup("active_user") + mkt_rendered = _render_effective(store, GUILD, CH_MARKETING, mkt.user_id) + fin_rendered = _render_effective(store, GUILD, CH_FINANCE, fin.user_id) + + # Read definitions directly from the store — don't parse rendered display text + by_term = _load_all(store, GUILD) + entries = by_term.get("active_user", []) + mkt_raw = store.kv_get(GUILD, _kv_key("active_user", "channel", CH_MARKETING)) + fin_raw = store.kv_get(GUILD, _kv_key("active_user", "channel", CH_FINANCE)) + mkt_def = FedEntry.from_json(mkt_raw).definition if mkt_raw else "" + fin_def = FedEntry.from_json(fin_raw).definition if fin_raw else "" - print(f" #{CH_MARKETING:<10} active_user → {mkt_def.definition}") - print(f" #{CH_FINANCE:<10} active_user → {fin_def.definition}") + print(f" #{CH_MARKETING:<10} active_user → {mkt_def}") + print(f" #{CH_FINANCE:<10} active_user → {fin_def}") - assert mkt_def.definition != fin_def.definition + assert mkt_def and fin_def and mkt_def != fin_def, ( + f"Federation failed: mkt_def={mkt_def!r}, fin_def={fin_def!r}" + ) print("\n ✅ Same term, two live definitions, zero conflict.") print(" Each channel is its own branch in the federation tree;") print(" neither overwrote the other. (Wren's single MDL cannot do this.)") - # Show the scope chain that produced the marketing answer. chain = " → ".join(str(s) for s in mkt.scope_chain()) print(f"\n #{CH_MARKETING} resolution order: {chain}") print(" Lookup stops at the first scope that defines the name (CHANNEL),") @@ -173,14 +169,13 @@ def section_3_safety(pipeline: SafetyPipeline) -> None: async def main() -> None: print("Lang2SQL v4.1 — e-commerce demo (offline: FakeLLM, canned PG, in-memory)") - # One shared resolver so federation state persists across sections 1 and 2. - resolver = ScopeResolver() + store = SqliteStore() pipeline = SafetyPipeline() concierge = ContextConcierge() await section_0_harness(concierge) - await section_1_define_metrics(resolver) - await section_2_federation(resolver) + await section_1_define_metrics(store) + await section_2_federation(store) section_3_safety(pipeline) _hr("DONE") diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index b7fc58f..9e6f1b1 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -64,21 +64,18 @@ 시스템 전체의 *어휘*가 모여 있습니다. 외부 의존 0, I/O 0. - [`types.py`](../src/lang2sql/core/types.py) — `Message`, `ToolCall`, `ToolResult`, `Completion`, `Role` - [`identity.py`](../src/lang2sql/core/identity.py) — `Identity`, `Scope`, federation의 `scope_chain()` 순서 (narrow→wide) -- [`ports/`](../src/lang2sql/core/ports/) — 11개 Protocol: `LLMPort`, `ExplorerPort`, `ToolPort`, `SafetyLayerPort`, `SafetyPipelinePort`, `StorePort`, `RecallPort`, `ExtractorPort` (memory), `SourcePort`, `DocExtractorPort`, `ScopeResolverPort`, `FrontendPort`, `SecretsPort`, `SessionStorePort`, `AuditPort` +- [`ports/`](../src/lang2sql/core/ports/) — Protocol: `LLMPort`, `ExplorerPort`, `ToolPort`, `SafetyLayerPort`, `SafetyPipelinePort`, `StorePort`, `RecallPort`, `ExtractorPort` (memory), `SourcePort`, `DocExtractorPort`, `FrontendPort`, `SecretsPort`, `SessionStorePort`, `AuditPort` ### `src/lang2sql/harness/` — 에이전트 한 턴의 엔진 -- [`context.py`](../src/lang2sql/harness/context.py) — `HarnessContext` (llm + tools + safety + explorer + scope_resolver + session 한 다발) +- [`context.py`](../src/lang2sql/harness/context.py) — `HarnessContext` (llm + tools + safety + explorer + store + session 한 다발) - [`session.py`](../src/lang2sql/harness/session.py) — 대화 transcript - [`loop.py`](../src/lang2sql/harness/loop.py) — `agent_loop`: system prompt → LLM → tool 호출 → 다음 턴 - [`tool_registry.py`](../src/lang2sql/harness/tool_registry.py) — 이름→도구 dispatch - [`system_prompt.py`](../src/lang2sql/harness/system_prompt.py) — 시멘틱 + 스키마 주입 -### `src/lang2sql/semantic/` — 시멘틱 레이어 + federation (★④) +### `src/lang2sql/semantic/` — 시멘틱 타입 정의 (★④) - [`types.py`](../src/lang2sql/semantic/types.py) — `SemanticEntry` (METRIC/DIMENSION/RELATIONSHIP/RULE) -- [`layer.py`](../src/lang2sql/semantic/layer.py) — `SemanticLayer.render()` (시스템 프롬프트로 들어감) -- [`scoped_layer.py`](../src/lang2sql/semantic/scoped_layer.py) — *가장 구체적 scope가 승리*하는 merge -- [`store.py`](../src/lang2sql/semantic/store.py) — in-memory store -- [`sql_composer.py`](../src/lang2sql/semantic/sql_composer.py) — metric 이름 → 정의 펼치기 (V1 최소) +- Federation 로직은 [`tools/semantic_federation.py`](../src/lang2sql/tools/semantic_federation.py)로 통합 (KV 기반) ### `src/lang2sql/safety/` — Read-only 게이트 (★①) - [`pipeline.py`](../src/lang2sql/safety/pipeline.py) — layer를 순서대로 통과, *첫 비-PASS에서 차단* @@ -98,10 +95,12 @@ - [`pipeline.py`](../src/lang2sql/ingestion/pipeline.py) — Source × Extractor matrix ### `src/lang2sql/tools/` — 에이전트가 부르는 capability -6개 도구 (모두 ctx-aware, async): +8개 도구 (모두 ctx-aware, async): - [`run_sql.py`](../src/lang2sql/tools/run_sql.py) — safety 통과 후 explorer로 실행 - [`explore_schema.py`](../src/lang2sql/tools/explore_schema.py) — 테이블/컬럼 introspection -- [`define_metric.py`](../src/lang2sql/tools/define_metric.py) — scope-aware 정의 쓰기 +- [`enrich_schema.py`](../src/lang2sql/tools/enrich_schema.py) — LLM으로 컬럼 메타데이터 자동 보강 +- [`semantic_federation.py`](../src/lang2sql/tools/semantic_federation.py) — `term_custom`: guild/channel/member 계층 용어 사전 (KV 기반, narrow→wide lookup) +- [`org_setup.py`](../src/lang2sql/tools/org_setup.py) — 전사/팀 단위 용어 일괄 등록 - [`remember.py`](../src/lang2sql/tools/remember.py) — fact 저장 - [`ask_user.py`](../src/lang2sql/tools/ask_user.py) — 모호하면 사용자에게 질문 - [`ingest_doc.py`](../src/lang2sql/tools/ingest_doc.py) — 문서 → 후보 제안 @@ -109,7 +108,6 @@ ### `src/lang2sql/tenancy/` — 조립점 - [`concierge.py`](../src/lang2sql/tenancy/concierge.py) — *유일하게* 구체 클래스를 import 하는 곳. 요청마다 `HarnessContext` 만듦. -- [`scope_resolver.py`](../src/lang2sql/tenancy/scope_resolver.py) — `ScopeResolverPort` 구현 (semantic 위) - [`encrypted_secrets.py`](../src/lang2sql/tenancy/encrypted_secrets.py) — `cryptography.Fernet` 실 암호화 ### `src/lang2sql/adapters/` — 외부 시스템과의 마지막 줄 diff --git a/docs/PROJECT.md b/docs/PROJECT.md index 7b71dd8..abb0ca4 100644 --- a/docs/PROJECT.md +++ b/docs/PROJECT.md @@ -48,9 +48,9 @@ Vanna AI(~20k★), Wren AI(~12k★), SQLCoder 같은 Text-to-SQL 오픈소스들 - **core 포트 11종** — 모든 외부 의존을 Protocol로 추상화 - **harness** — agent_loop(LLM → tool → 다음 턴), Session, HarnessContext - **★①~★④ 4기둥** 최소 구현 — safety 12 회귀, memory 3축, ingestion 매트릭스, federation 3-scope -- **도구 6종** — run_sql · explore_schema · define_metric · remember · ask_user · ingest_doc -- **Discord 프론트엔드** — 6개 슬래시 명령 + `/setup` 위저드 (비개발자 DSN-free flow) + bot.py -- **영속화** — SQLite 시멘틱 store + Fernet 실암호화 secrets +- **도구 8종** — run_sql · explore_schema · enrich_schema · term_custom · org_setup · remember · ask_user · ingest_doc +- **Discord 프론트엔드** — 슬래시 명령 + `/setup` 위저드 (비개발자 DSN-free flow) + bot.py +- **영속화** — KV store(federation) + Fernet 실암호화 secrets - **DB 어댑터** — `SqlAlchemyExplorer` 1개로 Postgres/MySQL/Snowflake/BigQuery/DuckDB 커버 + Cloudflare D1 HTTP 어댑터 + `build_explorer(DSN)` 자동 라우팅 - **106개 자동화 테스트** (safety 회귀 12 포함) - **bench 데모** — federation + safety 라이브 시연 (`bench/ecommerce_demo.py`) diff --git a/src/lang2sql/adapters/storage/sqlite_semantic.py b/src/lang2sql/adapters/storage/sqlite_semantic.py deleted file mode 100644 index 8ee941c..0000000 --- a/src/lang2sql/adapters/storage/sqlite_semantic.py +++ /dev/null @@ -1,104 +0,0 @@ -"""SqliteSemanticStore — durable backing for the semantic layer (★④). - -Drop-in replacement for the in-memory :class:`~lang2sql.semantic.store.SemanticStore`: -same ``add(scope, entry)`` / ``entries_at(scope)`` surface, but rows live in a -stdlib :mod:`sqlite3` table so definitions survive a process restart. Wiring -``ScopeResolver(store=SqliteSemanticStore(path))`` is all federation needs to -become persistent — the resolver and merge logic stay untouched. - -Like the rest of V1's sqlite usage this is synchronous and inline; the store -itself exposes the plain ``add``/``entries_at`` shape (no ``async``), matching -:class:`SemanticStore` so it substitutes cleanly under :class:`ScopeResolver`. -""" - -from __future__ import annotations - -import sqlite3 - -from ...core.identity import Scope -from ...semantic.types import SemanticEntry, SemanticKind - - -class SqliteSemanticStore: - """Per-scope semantic entries persisted in a single sqlite table. - - The ``(scope, name)`` pair is the primary key, mirroring the in-memory - store's "one entry per name within a scope" rule: re-adding a name at the - same scope replaces the prior definition. - """ - - def __init__(self, path: str = ":memory:") -> None: - self.path = path - self._conn = sqlite3.connect(path, check_same_thread=False) - self._conn.row_factory = sqlite3.Row - self._create_tables() - - def _create_tables(self) -> None: - self._conn.executescript( - """ - CREATE TABLE IF NOT EXISTS semantic_entries ( - scope TEXT NOT NULL, - kind TEXT NOT NULL, - name TEXT NOT NULL, - definition TEXT NOT NULL, - applies_to TEXT NOT NULL DEFAULT '', - source_id TEXT NOT NULL DEFAULT '', - created_by TEXT NOT NULL DEFAULT '', - created_at TEXT NOT NULL DEFAULT '', - PRIMARY KEY (scope, name) - ); - """ - ) - self._conn.commit() - - def close(self) -> None: - self._conn.close() - - def add(self, scope: Scope, entry: SemanticEntry) -> None: - """Store ``entry`` at ``scope``, replacing a same-named entry there.""" - self._conn.execute( - "INSERT INTO semantic_entries " - "(scope, kind, name, definition, applies_to, source_id, created_by, created_at) " - "VALUES (?, ?, ?, ?, ?, ?, ?, ?) " - "ON CONFLICT(scope, name) DO UPDATE SET " - "kind = excluded.kind, definition = excluded.definition, " - "applies_to = excluded.applies_to, source_id = excluded.source_id, " - "created_by = excluded.created_by, created_at = excluded.created_at", - ( - str(scope), - entry.kind.value, - entry.name, - entry.definition, - entry.applies_to, - entry.source_id, - entry.created_by, - entry.created_at, - ), - ) - self._conn.commit() - - def entries_at(self, scope: Scope) -> list[SemanticEntry]: - """Entries authored exactly at ``scope`` (no inheritance).""" - rows = self._conn.execute( - "SELECT kind, name, definition, applies_to, source_id, created_by, created_at " - "FROM semantic_entries WHERE scope = ? ORDER BY rowid", - (str(scope),), - ).fetchall() - return [_row_to_entry(r) for r in rows] - - -def _row_to_entry(row: sqlite3.Row) -> SemanticEntry: - """Reconstruct a :class:`SemanticEntry` from a stored row. - - ``created_at`` is passed through verbatim so the persisted timestamp is - preserved rather than regenerated by ``__post_init__``. - """ - return SemanticEntry( - kind=SemanticKind(row["kind"]), - name=row["name"], - definition=row["definition"], - applies_to=row["applies_to"], - source_id=row["source_id"], - created_by=row["created_by"], - created_at=row["created_at"], - ) diff --git a/src/lang2sql/adapters/storage/sqlite_store.py b/src/lang2sql/adapters/storage/sqlite_store.py index ef0c674..559c28b 100644 --- a/src/lang2sql/adapters/storage/sqlite_store.py +++ b/src/lang2sql/adapters/storage/sqlite_store.py @@ -130,15 +130,27 @@ def kv_delete(self, scope: str, key: str) -> None: ) self._conn.commit() + @staticmethod + def _escape_like(s: str) -> str: + return s.replace("!", "!!").replace("%", "!%").replace("_", "!_") + def kv_delete_prefix(self, scope: str, prefix: str) -> int: """Delete all keys under scope that start with prefix. Returns count deleted.""" cur = self._conn.execute( - "DELETE FROM kv WHERE scope = ? AND key LIKE ?", - (scope, prefix + "%"), + "DELETE FROM kv WHERE scope = ? AND key LIKE ? ESCAPE '!'", + (scope, self._escape_like(prefix) + "%"), ) self._conn.commit() return cur.rowcount + def kv_list_prefix(self, scope: str, prefix: str) -> list[tuple[str, str]]: + """Return (key, value) pairs for all keys under scope that start with prefix.""" + rows = self._conn.execute( + "SELECT key, value FROM kv WHERE scope = ? AND key LIKE ? ESCAPE '!' ORDER BY key", + (scope, self._escape_like(prefix) + "%"), + ).fetchall() + return [(r["key"], r["value"]) for r in rows] + # -- Session (de)serialization ------------------------------------------ diff --git a/src/lang2sql/core/identity.py b/src/lang2sql/core/identity.py index f527991..79037a2 100644 --- a/src/lang2sql/core/identity.py +++ b/src/lang2sql/core/identity.py @@ -81,6 +81,16 @@ def scope_chain(self) -> list[Scope]: chain.append(Scope(ScopeLevel.BUILTIN, "")) return chain + @property + def kv_scope(self) -> str: + """KV store namespace key for this identity's guild (or DM fallback).""" + return self.guild_id or f"dm:{self.user_id}" + + @property + def effective_channel_id(self) -> str: + """Channel entity for KV lookups: thread > channel > empty.""" + return self.thread_id or self.channel_id or "" + def default_write_scope(self) -> Scope: """Where a new definition lands when the user gives no ``--scope``. diff --git a/src/lang2sql/core/ports/semantic_scope.py b/src/lang2sql/core/ports/semantic_scope.py index 71c746c..8d8fd37 100644 --- a/src/lang2sql/core/ports/semantic_scope.py +++ b/src/lang2sql/core/ports/semantic_scope.py @@ -8,20 +8,19 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable from ..identity import Identity, Scope if TYPE_CHECKING: from ...semantic.types import SemanticEntry - from ...semantic.layer import SemanticLayer @runtime_checkable class ScopeResolverPort(Protocol): """Resolve the effective semantic layer for an identity's scope chain.""" - async def effective_layer(self, identity: Identity) -> "SemanticLayer": + async def effective_layer(self, identity: Identity) -> Any: """Merge scopes narrow→wide so the most specific definition wins.""" ... diff --git a/src/lang2sql/frontends/discord/bot.py b/src/lang2sql/frontends/discord/bot.py index edace0f..354036e 100644 --- a/src/lang2sql/frontends/discord/bot.py +++ b/src/lang2sql/frontends/discord/bot.py @@ -128,15 +128,6 @@ async def connect(interaction: discord.Interaction, dsn: str) -> None: async def ingest(interaction: discord.Interaction, ref: str) -> None: await self._run(interaction, handlers.ingest(to_identity(_interaction_context(interaction)), ref=ref)) - @tree.command(name="define_metric", description='Define a metric: name and "definition"') - async def define_metric( - interaction: discord.Interaction, name: str, definition: str - ) -> None: - await self._run( - interaction, - handlers.define_metric(to_identity(_interaction_context(interaction)), name, definition), - ) - @tree.command(name="remember", description="Remember a fact for future turns") async def remember(interaction: discord.Interaction, text: str) -> None: await self._run(interaction, handlers.remember(to_identity(_interaction_context(interaction)), text)) @@ -148,9 +139,33 @@ async def enrich(interaction: discord.Interaction, table: str = "", clear: bool handlers.enrich(to_identity(_interaction_context(interaction)), table=table, clear=clear), ) - @tree.command(name="semantic_show", description="Show definitions in effect here") - async def semantic_show(interaction: discord.Interaction) -> None: - await self._run(interaction, handlers.semantic_show(to_identity(_interaction_context(interaction)))) + @tree.command(name="term_custom", description="비즈니스 용어 등록·조회·삭제 (action: show / remove, term: 용어명)") + async def term_custom( + interaction: discord.Interaction, + action: str = "", + term: str = "", + layer: str = "member", + ) -> None: + ident = to_identity(_interaction_context(interaction)) + if action == "show": + await self._run(interaction, handlers.term_custom(ident, list_all=True)) + elif action == "remove": + await self._run(interaction, handlers.term_custom(ident, term=term, layer=layer, remove=True)) + else: + from .term_wizard import start_term_add_flow + await start_term_add_flow(interaction, handlers, _interaction_context) + + @tree.command(name="org_setup", description="조직(전사) 또는 팀(채널) 등록 + DB 스캔으로 비즈니스 용어 자동 추출") + async def org_setup( + interaction: discord.Interaction, + org: str = "", + team: str = "", + clear: bool = False, + ) -> None: + await self._run( + interaction, + handlers.org_setup(to_identity(_interaction_context(interaction)), org=org, team=team, clear=clear), + ) @tree.command(name="audit_me", description="Show your recent activity") async def audit_me(interaction: discord.Interaction) -> None: @@ -187,7 +202,10 @@ async def on_message(self, message: discord.Message) -> None: content, file = _to_sendable(out) if content and len(content) > 1900: content = content[:1900] + "\n…(truncated)" - await message.channel.send(content=content or "(empty)", file=file) + kwargs: dict = {"content": content or "(empty)"} + if file is not None: + kwargs["file"] = file + await message.channel.send(**kwargs) except Exception as exc: import traceback traceback.print_exc() diff --git a/src/lang2sql/frontends/discord/commands.py b/src/lang2sql/frontends/discord/commands.py index 3c1edb4..fbd7f5b 100644 --- a/src/lang2sql/frontends/discord/commands.py +++ b/src/lang2sql/frontends/discord/commands.py @@ -76,43 +76,12 @@ async def query(self, identity: Identity, text: str) -> OutboundMessage: suffix += "\n\n**결과:**\n```\n" + "\n\n".join(sql_results) + "\n```" return render_answer(answer + suffix) - async def define_metric( - self, - identity: Identity, - name: str, - definition: str, - scope: str | None = None, - ) -> OutboundMessage: - """Register one definition at the current (or requested) scope. - - Delegates to the ctx-aware ``define_metric`` tool, which writes through - the :class:`ScopeResolverPort` and records an audit event. ``scope`` is - ``"channel"`` (default) or ``"guild"`` per the tool's schema. - """ - ctx = await self._concierge.build_context(identity) - args: dict[str, str] = {"name": name, "definition": definition} - if scope: - args["scope"] = scope - result = await ctx.tools.dispatch("define_metric", args, ctx, "cmd:define_metric") - return OutboundMessage(text=result.content) - async def remember(self, identity: Identity, text: str) -> OutboundMessage: """Persist a user fact via the memory service (manual ``/remember``).""" ctx = await self._concierge.build_context(identity) result = await ctx.tools.dispatch("remember", {"text": text}, ctx, "cmd:remember") return OutboundMessage(text=result.content) - async def semantic_show(self, identity: Identity) -> OutboundMessage: - """Show the effective semantic layer for this scope chain.""" - ctx = await self._concierge.build_context(identity) - if ctx.scope_resolver is None: - return OutboundMessage(text="Semantic layer unavailable.") - layer = await ctx.scope_resolver.effective_layer(identity) - rendered = layer.render() - if not rendered: - return OutboundMessage(text="No definitions apply in this scope yet.") - return OutboundMessage(text=f"Definitions in effect here:\n{rendered}") - async def audit_me(self, identity: Identity) -> OutboundMessage: """List the caller's recent audited actions, newest first.""" ctx = await self._concierge.build_context(identity) @@ -165,7 +134,7 @@ async def register_db_for_guild( ) ) - scope = identity.guild_id or f"dm:{identity.user_id}" + scope = identity.kv_scope await self._concierge.secrets.set(scope, "db_dsn", spec.dsn) for k, v in spec.extras.items(): await self._concierge.secrets.set(scope, f"db_extras.{k}", v) @@ -175,7 +144,7 @@ async def register_db_for_guild( return OutboundMessage( text=( f"✅ Connected to **{db_type}** — found **{len(tables)} table(s)**. " - "Your credentials are stored encrypted; you can `/semantic_show` " + "Your credentials are stored encrypted; you can `/term_custom action:show` " "or just ask a question now." ) ) @@ -188,6 +157,42 @@ async def enrich(self, identity: Identity, table: str = "", clear: bool = False) ) return OutboundMessage(text=result.content) + async def org_setup( + self, identity: Identity, org: str = "", team: str = "", clear: bool = False + ) -> OutboundMessage: + """조직(전사) 또는 팀(채널) 등록 + DB 스캔으로 비즈니스 용어 자동 추출.""" + ctx = await self._concierge.build_context(identity) + result = await ctx.tools.dispatch( + "org_setup", {"org": org, "team": team, "clear": clear}, ctx, "cmd:org_setup" + ) + return OutboundMessage(text=result.content) + + async def term_custom( + self, + identity: Identity, + term: str = "", + definition: str = "", + layer: str = "member", + synonyms: str = "", + inferred: bool = False, + scan: bool = False, + remove: bool = False, + list_all: bool = False, + ) -> OutboundMessage: + """채널(팀)/전사/개인 계층 비즈니스 용어 사전 관리.""" + ctx = await self._concierge.build_context(identity) + result = await ctx.tools.dispatch( + "term_custom", + { + "term": term, "definition": definition, "layer": layer, + "synonyms": synonyms, "inferred": inferred, "scan": scan, + "remove": remove, "list": list_all, + }, + ctx, + "cmd:term_custom", + ) + return OutboundMessage(text=result.content) + async def connect(self, identity: Identity, dsn: str) -> OutboundMessage: """V1 stub: stash a DB DSN keyed by guild/DM in the concierge kv store. @@ -200,7 +205,7 @@ async def connect(self, identity: Identity, dsn: str) -> OutboundMessage: dsn = dsn.strip() if not dsn: return OutboundMessage(text="Provide a database connection string.") - scope = identity.guild_id or f"dm:{identity.user_id}" + scope = identity.kv_scope self._concierge.store.kv_set(scope, "dsn", dsn) return OutboundMessage( text=( diff --git a/src/lang2sql/frontends/discord/term_wizard.py b/src/lang2sql/frontends/discord/term_wizard.py new file mode 100644 index 0000000..eb4cdb8 --- /dev/null +++ b/src/lang2sql/frontends/discord/term_wizard.py @@ -0,0 +1,124 @@ +"""term_wizard.py — /term_custom 등록 폼 (2단계 UI). + +Step 1: Select — 전사(guild) / 채널·팀(channel) / 개인(member) 선택 +Step 2: Modal — 용어명·정의·동의어 입력 + +채널이 팀 경계 역할을 하므로 entity 직접 입력 불필요. +setup_wizard.py 패턴 동일: Select 선택 → Modal 응답. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import discord +from discord import ui + +from .session_router import to_identity + +if TYPE_CHECKING: + from .commands import CommandHandlers + + +_LAYER_OPTIONS = [ + discord.SelectOption( + label="전사 (Guild) — 회사 공통 정의", + value="guild", + description="모든 채널에서 기본값으로 사용", + ), + discord.SelectOption( + label="채널 (팀) — 이 채널 전용 정의", + value="channel", + description="다른 채널과 충돌 없이 이 채널에서만 유효", + ), + discord.SelectOption( + label="개인 — 나만 사용하는 정의", + value="member", + description="전사·채널 정의를 조용히 덮어씀", + ), +] + + +class _TermModal(ui.Modal, title="비즈니스 용어 등록"): + term = ui.TextInput( + label="용어명", + placeholder="예: 활성고객", + required=True, + max_length=100, + ) + definition = ui.TextInput( + label="정의", + placeholder="예: 최근 30일 내 로그인한 users", + required=True, + style=discord.TextStyle.paragraph, + max_length=500, + ) + synonyms = ui.TextInput( + label="동의어 (쉼표 구분, 선택)", + placeholder="예: active user, 활성화고객", + required=False, + max_length=200, + ) + + def __init__(self, layer: str, handlers: "CommandHandlers", ctx_factory) -> None: + super().__init__() + self._layer = layer + self._handlers = handlers + self._ctx_factory = ctx_factory + + async def on_submit(self, interaction: discord.Interaction) -> None: + await interaction.response.defer(ephemeral=True, thinking=True) + try: + identity = to_identity(self._ctx_factory(interaction)) + result = await self._handlers.term_custom( + identity, + term=self.term.value.strip(), + definition=self.definition.value.strip(), + layer=self._layer, + synonyms=self.synonyms.value.strip(), + ) + await interaction.followup.send(result.text, ephemeral=True) + except Exception as exc: + try: + await interaction.followup.send(f"❌ 오류: {exc}", ephemeral=True) + except Exception: + pass + + +class _LayerSelect(ui.Select): + def __init__(self, handlers: "CommandHandlers", ctx_factory) -> None: + super().__init__( + placeholder="적용 범위를 선택하세요…", + options=_LAYER_OPTIONS, + min_values=1, + max_values=1, + ) + self._handlers = handlers + self._ctx_factory = ctx_factory + + async def callback(self, interaction: discord.Interaction) -> None: + await interaction.response.send_modal( + _TermModal(self.values[0], self._handlers, self._ctx_factory) + ) + + +class _LayerSelectView(ui.View): + def __init__(self, handlers: "CommandHandlers", ctx_factory) -> None: + super().__init__(timeout=120.0) + self.add_item(_LayerSelect(handlers, ctx_factory)) + + +async def start_term_add_flow( + interaction: discord.Interaction, + handlers: "CommandHandlers", + ctx_factory, +) -> None: + """bot.py의 /term_custom 커맨드에서 호출 — 범위 선택 → 용어 등록 모달.""" + await interaction.response.send_message( + "용어를 등록할 **범위**를 선택하세요.\n" + "- **전사**: 모든 채널에서 기본값\n" + "- **채널(팀)**: 이 채널에서만 유효 (다른 채널과 충돌 없음)\n" + "- **개인**: 나만 사용하는 정의 (전사·채널 정의를 덮어씀)", + view=_LayerSelectView(handlers, ctx_factory), + ephemeral=True, + ) diff --git a/src/lang2sql/harness/context.py b/src/lang2sql/harness/context.py index 5266521..d9af9b1 100644 --- a/src/lang2sql/harness/context.py +++ b/src/lang2sql/harness/context.py @@ -19,7 +19,6 @@ from ..core.ports.explorer import ExplorerPort from ..core.ports.llm import LLMPort from ..core.ports.safety import SafetyPipelinePort -from ..core.ports.semantic_scope import ScopeResolverPort from .session import Session from .tool_registry import ToolRegistry @@ -33,6 +32,5 @@ class HarnessContext: explorer: ExplorerPort | None = None safety: SafetyPipelinePort | None = None audit: AuditPort | None = None - scope_resolver: ScopeResolverPort | None = None store: SqliteStore | None = None max_turns: int = 8 diff --git a/src/lang2sql/harness/system_prompt.py b/src/lang2sql/harness/system_prompt.py index 3899a6f..31e70c4 100644 --- a/src/lang2sql/harness/system_prompt.py +++ b/src/lang2sql/harness/system_prompt.py @@ -27,16 +27,10 @@ async def build_system_prompt(ctx: HarnessContext) -> str: parts: list[str] = [_BASE] - if ctx.scope_resolver is not None: - layer = await ctx.scope_resolver.effective_layer(ctx.identity) - rendered = layer.render() if layer is not None else "" - if rendered: - parts.append("## Semantic layer (effective for this scope)\n" + rendered) - if ctx.explorer is not None: tables = await ctx.explorer.list_tables() if tables: - scope = (ctx.identity.guild_id or f"dm:{ctx.identity.user_id}") if ctx.store else None + scope = ctx.identity.kv_scope if ctx.store else None has_enrichment = bool( scope and ctx.store and ctx.store.kv_get(scope, "schema_relationships") @@ -61,7 +55,7 @@ async def build_system_prompt(ctx: HarnessContext) -> str: parts.append("## Known tables\n" + names) if ctx.store is not None: - scope = ctx.identity.guild_id or f"dm:{ctx.identity.user_id}" + scope = ctx.identity.kv_scope raw = ctx.store.kv_get(scope, "schema_relationships") if raw: try: @@ -72,4 +66,11 @@ async def build_system_prompt(ctx: HarnessContext) -> str: except (ValueError, TypeError): pass + from ..tools.semantic_federation import build_prompt_section + user_id = ctx.identity.user_id or "unknown" + channel_id = ctx.identity.effective_channel_id + semfed_section = build_prompt_section(ctx.store, scope, channel_id, user_id) + if semfed_section: + parts.append(semfed_section) + return "\n\n".join(parts) diff --git a/src/lang2sql/semantic/__init__.py b/src/lang2sql/semantic/__init__.py index 110ae6e..d896767 100644 --- a/src/lang2sql/semantic/__init__.py +++ b/src/lang2sql/semantic/__init__.py @@ -1,11 +1,7 @@ -"""Semantic federation layer (★④) — definitions, scoping, composition.""" +"""Semantic federation layer (★④) — type definitions.""" from __future__ import annotations -from .layer import SemanticLayer -from .scoped_layer import merge_scoped -from .sql_composer import SQLComposer -from .store import SemanticStore from .types import ( Dimension, Metric, @@ -16,10 +12,6 @@ ) __all__ = [ - "SemanticLayer", - "merge_scoped", - "SQLComposer", - "SemanticStore", "SemanticEntry", "SemanticKind", "Metric", diff --git a/src/lang2sql/semantic/layer.py b/src/lang2sql/semantic/layer.py deleted file mode 100644 index d82e4c7..0000000 --- a/src/lang2sql/semantic/layer.py +++ /dev/null @@ -1,59 +0,0 @@ -"""The effective semantic layer for one resolved scope (★④). - -A :class:`SemanticLayer` is a flat bag of :class:`SemanticEntry` already -resolved for a particular request — federation merging happens upstream in -:mod:`lang2sql.semantic.scoped_layer`. The harness calls :meth:`render` to -splice these definitions into the system prompt so the model prefers them over -its own assumptions. -""" - -from __future__ import annotations - -from .types import SemanticEntry - - -class SemanticLayer: - """An ordered collection of definitions keyed by ``name``. - - Insertion order is preserved for stable rendering; :meth:`add` replaces an - existing entry of the same ``name`` in place so later authority wins - without reordering. - """ - - def __init__(self, entries: list[SemanticEntry] | None = None) -> None: - self._entries: list[SemanticEntry] = list(entries or []) - - @property - def entries(self) -> list[SemanticEntry]: - return list(self._entries) - - def lookup(self, name: str) -> SemanticEntry | None: - """Return the entry named ``name``, or ``None`` if absent.""" - for entry in self._entries: - if entry.name == name: - return entry - return None - - def add(self, entry: SemanticEntry) -> None: - """Add ``entry``, replacing any existing entry with the same name.""" - for i, existing in enumerate(self._entries): - if existing.name == entry.name: - self._entries[i] = entry - return - self._entries.append(entry) - - def render(self) -> str: - """Markdown bullet list for system-prompt injection. - - Empty string when there are no entries so the caller can skip the - section header entirely. - """ - if not self._entries: - return "" - lines: list[str] = [] - for entry in self._entries: - scope = f" (applies to {entry.applies_to})" if entry.applies_to else "" - lines.append( - f"- **{entry.name}** [{entry.kind.value}]: {entry.definition}{scope}" - ) - return "\n".join(lines) diff --git a/src/lang2sql/semantic/scoped_layer.py b/src/lang2sql/semantic/scoped_layer.py deleted file mode 100644 index 17a1903..0000000 --- a/src/lang2sql/semantic/scoped_layer.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Pure federation merge — most specific scope wins (★④). - -Given each scope's own entries ordered NARROW→WIDE (thread, channel, guild, -builtin), collapse them into one :class:`SemanticLayer`. For any ``name`` the -first (narrowest) definition encountered is authoritative; wider scopes only -contribute names the narrow scopes did not define. No I/O here — the resolver -gathers the per-scope lists and hands them in. -""" - -from __future__ import annotations - -from ..core.identity import Scope -from .layer import SemanticLayer -from .types import SemanticEntry - - -def merge_scoped( - scoped_entries: list[tuple[Scope, list[SemanticEntry]]], -) -> SemanticLayer: - """Merge ordered ``(scope, entries)`` pairs into one effective layer. - - ``scoped_entries`` must run narrow→wide. The first definition seen for a - given ``name`` wins; later (wider) ones for the same name are dropped. - """ - layer = SemanticLayer() - seen: set[str] = set() - for _scope, entries in scoped_entries: - for entry in entries: - if entry.name in seen: - continue - seen.add(entry.name) - layer.add(entry) - return layer diff --git a/src/lang2sql/semantic/sql_composer.py b/src/lang2sql/semantic/sql_composer.py deleted file mode 100644 index 0fa928a..0000000 --- a/src/lang2sql/semantic/sql_composer.py +++ /dev/null @@ -1,27 +0,0 @@ -"""Expand a metric name to its definition (★④, minimal V1). - -V1 deliberately does the simplest useful thing: resolve a name against an -effective :class:`SemanticLayer` and hand back the stored definition string. -Real composition — substituting nested metric references, stitching dimension -filters, emitting a SQL fragment — is later work and is flagged inline. -""" - -from __future__ import annotations - -from .layer import SemanticLayer - - -class SQLComposer: - """Resolve metric/dimension names against a resolved semantic layer.""" - - def __init__(self, layer: SemanticLayer) -> None: - self._layer = layer - - def expand(self, name: str) -> str | None: - """Return the definition string for ``name``, or ``None`` if unknown. - - TODO(v1.5): real composition — recursively expand nested references and - emit a SQL fragment rather than the raw stored definition. - """ - entry = self._layer.lookup(name) - return entry.definition if entry is not None else None diff --git a/src/lang2sql/semantic/store.py b/src/lang2sql/semantic/store.py deleted file mode 100644 index 06063d4..0000000 --- a/src/lang2sql/semantic/store.py +++ /dev/null @@ -1,32 +0,0 @@ -"""In-memory storage of semantic definitions, keyed by scope (★④). - -V1 keeps everything in a process-local dict — SQLite persistence is deferred -(v1.5). The store is intentionally dumb: it knows nothing about inheritance or -resolution, only "here are the entries authored exactly at this scope". -Resolution lives in :mod:`lang2sql.tenancy.scope_resolver`. -""" - -from __future__ import annotations - -from ..core.identity import Scope -from .types import SemanticEntry - - -class SemanticStore: - """Maps ``str(scope)`` → the entries defined exactly at that scope.""" - - def __init__(self) -> None: - self._by_scope: dict[str, list[SemanticEntry]] = {} - - def add(self, scope: Scope, entry: SemanticEntry) -> None: - """Store ``entry`` at ``scope``, replacing a same-named entry there.""" - bucket = self._by_scope.setdefault(str(scope), []) - for i, existing in enumerate(bucket): - if existing.name == entry.name: - bucket[i] = entry - return - bucket.append(entry) - - def entries_at(self, scope: Scope) -> list[SemanticEntry]: - """Entries authored exactly at ``scope`` (no inheritance).""" - return list(self._by_scope.get(str(scope), [])) diff --git a/src/lang2sql/tenancy/__init__.py b/src/lang2sql/tenancy/__init__.py index 2f339c9..b425070 100644 --- a/src/lang2sql/tenancy/__init__.py +++ b/src/lang2sql/tenancy/__init__.py @@ -1,6 +1,6 @@ -"""Tenancy — scope resolution, per-scope secrets, and context assembly. +"""Tenancy — per-scope secrets and context assembly. -The :class:`ContextConcierge` is the one place concrete semantic/safety/adapter +The :class:`ContextConcierge` is the one place concrete safety/adapter classes get wired together into a :class:`~lang2sql.harness.context.HarnessContext`. """ @@ -8,6 +8,5 @@ from .concierge import ContextConcierge from .encrypted_secrets import EncryptedSecrets -from .scope_resolver import ScopeResolver -__all__ = ["ContextConcierge", "EncryptedSecrets", "ScopeResolver"] +__all__ = ["ContextConcierge", "EncryptedSecrets"] diff --git a/src/lang2sql/tenancy/concierge.py b/src/lang2sql/tenancy/concierge.py index 526eadf..5812865 100644 --- a/src/lang2sql/tenancy/concierge.py +++ b/src/lang2sql/tenancy/concierge.py @@ -18,7 +18,6 @@ from ..adapters.db.postgres_explorer import PostgresExplorer from ..adapters.llm.fake import FakeLLM from ..adapters.llm.openai_ import OpenAILLM -from ..adapters.storage.sqlite_semantic import SqliteSemanticStore from ..adapters.storage.sqlite_store import SqliteStore from ..core.identity import Identity from ..core.ports.audit import AuditPort @@ -26,7 +25,6 @@ from ..core.ports.llm import LLMPort from ..core.ports.safety import SafetyPipelinePort from ..core.ports.secrets import SecretsPort -from ..core.ports.semantic_scope import ScopeResolverPort from ..harness.context import HarnessContext from ..harness.session import Session from ..harness.tool_registry import ToolRegistry @@ -35,7 +33,6 @@ from ..safety.pipeline import SafetyPipeline from ..tools import build_default_tools from .encrypted_secrets import EncryptedSecrets -from .scope_resolver import ScopeResolver # DSN used for the V1 explorer stub when a scope has registered none yet. _DEFAULT_DSN = "postgresql://stub/v1" @@ -52,27 +49,14 @@ def __init__( llm: LLMPort | None = None, explorer: ExplorerPort | None = None, safety: SafetyPipelinePort | None = None, - scope_resolver: ScopeResolverPort | None = None, secrets: SecretsPort | None = None, audit: AuditPort | None = None, max_turns: int = 8, ) -> None: - # ``path`` drives the default persistence backends; ``:memory:`` keeps - # tests isolated, a file path makes sessions/definitions/secrets durable. self._store = store if store is not None else SqliteStore(path) - # Audit + session persistence both ride the one sqlite store by default. self._llm = llm if llm is not None else _default_llm() - # Explorer precedence: explicit injection → env-configured real DB - # (LANG2SQL_DB_URL / Cloudflare D1) → the canned stub for offline dev. self._explorer = explorer or explorer_from_env() or PostgresExplorer(_DEFAULT_DSN) self._safety = safety if safety is not None else SafetyPipeline() - # Persistent semantic store by default so definitions survive restart. - self._scope_resolver = ( - scope_resolver - if scope_resolver is not None - else ScopeResolver(SqliteSemanticStore(path)) - ) - # Secrets share the session/audit store's kv table (and sqlite file). self._secrets = ( secrets if secrets is not None else EncryptedSecrets(self._store) ) @@ -99,11 +83,6 @@ def secrets(self) -> SecretsPort: """Per-scope encrypted credential store (DSNs/API keys via ``/connect``).""" return self._secrets - @property - def scope_resolver(self) -> ScopeResolverPort: - """Federation resolver over the (by default persistent) semantic store.""" - return self._scope_resolver - def forget_explorer(self, scope: str) -> None: """Bust the cached explorer for ``scope`` (call after /setup updates a DSN).""" self._scope_explorers.pop(scope, None) @@ -115,7 +94,7 @@ async def _explorer_for(self, identity: Identity) -> ExplorerPort: secrets), build an explorer from it (cached). Otherwise fall back to the concierge's default explorer (env-configured or stub). """ - scope = identity.guild_id or f"dm:{identity.user_id}" + scope = identity.kv_scope cached = self._scope_explorers.get(scope) if cached is not None: return cached @@ -154,7 +133,6 @@ async def build_context( explorer=await self._explorer_for(identity), safety=self._safety, audit=self._audit, - scope_resolver=self._scope_resolver, store=self._store, max_turns=self._max_turns, ) @@ -169,7 +147,9 @@ def _default_llm() -> LLMPort: api_key = os.environ.get("OPENAI_API_KEY") or "local" url = base_url.rstrip("/") if not url.endswith("/chat/completions"): - url = url + "/v1/chat/completions" + if not url.endswith("/v1"): + url = url + "/v1" + url = url + "/chat/completions" return OpenAILLM(model=model, api_key=api_key, base_url=url) if os.environ.get("OPENAI_API_KEY"): return OpenAILLM() diff --git a/src/lang2sql/tenancy/scope_resolver.py b/src/lang2sql/tenancy/scope_resolver.py deleted file mode 100644 index 2093fbf..0000000 --- a/src/lang2sql/tenancy/scope_resolver.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Federation resolution over a :class:`SemanticStore` (★④). - -Implements :class:`~lang2sql.core.ports.semantic_scope.ScopeResolverPort`. The -store holds raw per-scope definitions; this resolver walks an identity's -``scope_chain()`` (narrow→wide) and merges them so the most specific definition -of each ``name`` wins. ``define``/``entries_at`` delegate straight to the store. -""" - -from __future__ import annotations - -from ..core.identity import Identity, Scope -from ..semantic.layer import SemanticLayer -from ..semantic.scoped_layer import merge_scoped -from ..semantic.store import SemanticStore -from ..semantic.types import SemanticEntry - - -class ScopeResolver: - """Resolve effective semantic layers, backed by a :class:`SemanticStore`.""" - - def __init__(self, store: SemanticStore | None = None) -> None: - self._store = store if store is not None else SemanticStore() - - @property - def store(self) -> SemanticStore: - return self._store - - async def effective_layer(self, identity: Identity) -> SemanticLayer: - """Merge ``identity``'s scope chain narrow→wide; most specific wins.""" - scoped = [ - (scope, self._store.entries_at(scope)) - for scope in identity.scope_chain() - ] - return merge_scoped(scoped) - - async def define(self, scope: Scope, entry: SemanticEntry) -> None: - """Persist one definition at an explicit scope.""" - self._store.add(scope, entry) - - async def entries_at(self, scope: Scope) -> list[SemanticEntry]: - """Definitions stored exactly at ``scope`` (no inheritance).""" - return self._store.entries_at(scope) diff --git a/src/lang2sql/tools/__init__.py b/src/lang2sql/tools/__init__.py index cbd2e8f..b8d0813 100644 --- a/src/lang2sql/tools/__init__.py +++ b/src/lang2sql/tools/__init__.py @@ -13,16 +13,18 @@ from ..ingestion.pipeline import IngestionPipeline from ..memory.service import MemoryService from .ask_user import AskUser -from .define_metric import DefineMetric from .enrich_schema import EnrichSchema from .explore_schema import ExploreSchema from .ingest_doc import IngestDoc +from .org_setup import OrgSetupTool from .remember import Remember from .run_sql import RunSQL +from .semantic_federation import SemanticFederationTool __all__ = [ "build_default_tools", - "RunSQL", "ExploreSchema", "EnrichSchema", "DefineMetric", "Remember", "AskUser", "IngestDoc", + "RunSQL", "ExploreSchema", "EnrichSchema", "SemanticFederationTool", + "OrgSetupTool", "Remember", "AskUser", "IngestDoc", ] @@ -38,7 +40,8 @@ def build_default_tools( RunSQL(), ExploreSchema(), EnrichSchema(), - DefineMetric(), + SemanticFederationTool(), + OrgSetupTool(), AskUser(), Remember(memory), IngestDoc(ingestion, source, extractor), diff --git a/src/lang2sql/tools/define_metric.py b/src/lang2sql/tools/define_metric.py deleted file mode 100644 index 68dd546..0000000 --- a/src/lang2sql/tools/define_metric.py +++ /dev/null @@ -1,70 +0,0 @@ -"""define_metric — scope-aware definition writer (★④ federation). - -Writes one :class:`SemanticEntry` to a federation scope via the -:class:`ScopeResolverPort`. With no explicit scope it lands at the identity's -default write scope (current channel), so ``#marketing`` and ``#finance`` can -hold different definitions of the same name without conflict. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from ..core.ports.audit import AuditEvent -from ..core.identity import Scope, ScopeLevel -from ..core.types import ToolResult, ToolSpec -from ..semantic.types import SemanticEntry, SemanticKind - -if TYPE_CHECKING: - from ..harness.context import HarnessContext - - -class DefineMetric: - @property - def spec(self) -> ToolSpec: - return ToolSpec( - name="define_metric", - description=( - "Define a metric/dimension/rule for the current scope (channel by " - "default). Later questions in this scope use this definition." - ), - parameters={ - "type": "object", - "properties": { - "name": {"type": "string"}, - "definition": {"type": "string"}, - "kind": {"type": "string", "enum": ["metric", "dimension", "rule"]}, - "scope": {"type": "string", "enum": ["channel", "guild"], - "description": "where to store it; default channel"}, - }, - "required": ["name", "definition"], - }, - ) - - async def run(self, args: dict[str, Any], ctx: "HarnessContext") -> ToolResult: - if ctx.scope_resolver is None: - return ToolResult(call_id="", content="semantic layer unavailable", is_error=True) - - name = (args.get("name") or "").strip() - definition = (args.get("definition") or "").strip() - if not name or not definition: - return ToolResult(call_id="", content="name and definition are required", is_error=True) - - kind = SemanticKind(args.get("kind", "metric")) - scope = self._resolve_scope(args.get("scope"), ctx) - entry = SemanticEntry(kind=kind, name=name, definition=definition, - created_by=ctx.identity.user_id) - await ctx.scope_resolver.define(scope, entry) - - if ctx.audit is not None: - await ctx.audit.record( - AuditEvent(actor=ctx.identity.user_id, action="define_metric", - scope=str(scope), detail={"name": name, "kind": kind.value}) - ) - return ToolResult(call_id="", content=f"✅ {kind.value} '{name}' defined at {scope}.") - - @staticmethod - def _resolve_scope(requested: str | None, ctx: "HarnessContext") -> Scope: - if requested == "guild" and ctx.identity.guild_id: - return Scope(ScopeLevel.GUILD, ctx.identity.guild_id) - return ctx.identity.default_write_scope() diff --git a/src/lang2sql/tools/enrich_schema.py b/src/lang2sql/tools/enrich_schema.py index 7898b20..f137398 100644 --- a/src/lang2sql/tools/enrich_schema.py +++ b/src/lang2sql/tools/enrich_schema.py @@ -89,7 +89,7 @@ async def run(self, args: dict[str, Any], ctx: "HarnessContext") -> ToolResult: if ctx.store is None: return ToolResult(call_id="", content="KV store를 사용할 수 없습니다.", is_error=True) - scope = ctx.identity.guild_id or f"dm:{ctx.identity.user_id}" + scope = ctx.identity.kv_scope if args.get("clear"): count = ctx.store.kv_delete_prefix(scope, _KV_PREFIX + ":") diff --git a/src/lang2sql/tools/explore_schema.py b/src/lang2sql/tools/explore_schema.py index 327f3c3..535266b 100644 --- a/src/lang2sql/tools/explore_schema.py +++ b/src/lang2sql/tools/explore_schema.py @@ -22,7 +22,7 @@ def _apply_enrichment_cache(table: Table, ctx: "HarnessContext") -> Table: """Overlay KV-cached descriptions onto columns that lack one.""" if ctx.store is None: return table - scope = ctx.identity.guild_id or f"dm:{ctx.identity.user_id}" + scope = ctx.identity.kv_scope enriched_cols: list[Column] = [] for col in table.columns: if col.description: diff --git a/src/lang2sql/tools/org_setup.py b/src/lang2sql/tools/org_setup.py new file mode 100644 index 0000000..7df2668 --- /dev/null +++ b/src/lang2sql/tools/org_setup.py @@ -0,0 +1,231 @@ +"""OrgSetupTool — 팀/조직 등록 + 비즈니스 용어 자동 추출. + +온보딩 2단계 (/setup으로 DB 연결 후 실행): +1. 접근 가능한 테이블 스캔 (팀의 DB 권한 = 팀이 보는 테이블) +2. LLM이 테이블 구조 + 샘플 데이터 분석 → 팀 도메인 + 핵심 용어 추론 +3. 결과를 SemanticFederationTool과 동일한 KV 네임스페이스에 저장 + → build_prompt_section()이 자동으로 읽어 시스템 프롬프트에 주입 + +KV 저장: + org:{org_lower} → {"name", "domain", "registered_at"} + team:{team_lower}:{channel_id} → {"name", "domain", "registered_at"} (팀 등록 시) + cterm:{term_lower}:guild → FedEntry JSON (org 전용, guild 레이어) + cterm:{term_lower}:channel:{channel_id} → FedEntry JSON (team 등록 시, channel 레이어) +""" + +from __future__ import annotations + +import json +import re +import time +from typing import TYPE_CHECKING, Any + +from ..core.ports.tool import ToolPort +from ..core.types import Message, Role, ToolResult, ToolSpec +from .semantic_federation import FedEntry, _KV_PREFIX as _SEMFED_PREFIX, _kv_key as _semfed_kv_key + +if TYPE_CHECKING: + from ..harness.context import HarnessContext + +_SAMPLE_LIMIT = 10 +_ORG_PREFIX = "org" +_TEAM_PREFIX = "team" + + +def _build_prompt(org_name: str, schema_block: str) -> str: + return ( + f'이 DB에 접근 권한이 있는 팀/조직은 **"{org_name}"** 입니다.\n' + "아래는 이 팀이 접근 가능한 테이블 스키마와 실제 데이터 샘플입니다.\n\n" + f"{schema_block}\n\n" + "위 데이터를 분석해서 다음을 추론해줘:\n" + "1. 이 팀이 담당하는 업무 도메인 (한 줄)\n" + "2. 이 팀이 자주 사용할 비즈니스 핵심 용어 (최대 10개)\n" + " - 각 용어의 DB 기반 정의 (어느 테이블/컬럼에 해당하는지 포함)\n" + " - 다른 팀에서 다르게 부를 수 있는 동의어/별칭\n\n" + "아래 JSON 형식으로만 응답:\n" + "{\n" + ' "domain": "이 팀의 업무 도메인 한 줄 설명",\n' + ' "terms": [\n' + ' {"term": "용어명", "definition": "DB 기반 정의", "synonyms": ["동의어1", "동의어2"]}\n' + " ]\n" + "}" + ) + + +def _extract_result(text: str) -> tuple[str, list[dict]]: + m = re.search(r"\{.*\}", text, re.DOTALL) + if not m: + return "", [] + try: + data = json.loads(m.group(0)) + domain = data.get("domain", "") if isinstance(data, dict) else "" + terms = data.get("terms", []) if isinstance(data, dict) else [] + if not isinstance(terms, list): + terms = [] + return domain, terms + except (ValueError, TypeError): + return "", [] + + +class OrgSetupTool(ToolPort): + @property + def spec(self) -> ToolSpec: + return ToolSpec( + name="org_setup", + description=( + "조직(전사) 또는 팀(채널) 등록 및 DB 테이블 스캔으로 비즈니스 용어를 자동 추출한다. " + "org만 지정 시 guild 레이어(전사 공통), team 지정 시 channel 레이어(팀 전용)에 저장. " + "DB 연결(/setup) 후 실행." + ), + parameters={ + "type": "object", + "properties": { + "org": { + "type": "string", + "description": "전사 조직 이름 (예: ACME). 전사 공통 용어를 guild 레이어에 저장.", + }, + "team": { + "type": "string", + "description": "팀 이름 (예: 마케팅팀). 현재 채널에 팀 전용 용어를 channel 레이어에 저장. org 없이 단독 사용 가능.", + }, + "clear": { + "type": "boolean", + "description": "true이면 해당 org/team의 자동 추출 용어 초기화 (수동 등록 용어 보존)", + }, + }, + }, + ) + + async def run(self, args: dict[str, Any], ctx: "HarnessContext") -> ToolResult: + if ctx.store is None: + return ToolResult(call_id="", content="❌ KV store 미설정", is_error=True) + + org_name = str(args.get("org", "")).strip() + team_name = str(args.get("team", "")).strip() + + if not org_name and not team_name: + return ToolResult(call_id="", content="❌ org 또는 team 파라미터가 필요합니다.", is_error=True) + + scope = ctx.identity.kv_scope + channel_id = ctx.identity.effective_channel_id + + # team이 있으면 channel 레이어, org만 있으면 guild 레이어 + use_team = bool(team_name) + layer = "channel" if use_team else "guild" + + if layer == "guild" and not ctx.identity.is_admin: + return ToolResult( + call_id="", + content="❌ 전사(guild) 용어 등록·초기화는 관리자만 가능합니다.", + is_error=True, + ) + entity = channel_id if use_team else "" + display_name = team_name if use_team else org_name + meta_key = ( + f"{_TEAM_PREFIX}:{team_name.lower()}:{channel_id}" + if use_team + else f"{_ORG_PREFIX}:{org_name.lower()}" + ) + + if args.get("clear"): + entries = ctx.store.kv_list_prefix(scope, f"{_SEMFED_PREFIX}:") + deleted = 0 + for key, val in entries: + try: + data = json.loads(val) + except (ValueError, TypeError): + continue + if not data.get("inferred"): + continue + # 레이어와 entity가 일치하는 항목만 삭제 + entry_layer = data.get("layer", "") + entry_entity = data.get("entity", "") + if entry_layer == layer and entry_entity == entity: + ctx.store.kv_delete(scope, key) + deleted += 1 + ctx.store.kv_delete(scope, meta_key) + layer_label = "팀(채널)" if use_team else "전사(guild)" + return ToolResult( + call_id="", + content=f"🗑️ **{display_name}** [{layer_label}] 자동 추출 용어 {deleted}개 초기화 완료 (수동 등록 용어 보존)", + ) + + if ctx.explorer is None: + return ToolResult(call_id="", content="❌ DB가 연결되지 않았습니다 (/setup 먼저).", is_error=True) + + all_tables = await ctx.explorer.list_tables() + if not all_tables: + return ToolResult(call_id="", content="❌ 접근 가능한 테이블이 없습니다.", is_error=True) + + schema_lines: list[str] = [] + for tbl in all_tables: + try: + described = await ctx.explorer.describe_table(tbl.name) + except Exception: + continue + schema_lines.append(f"테이블: {tbl.name}") + for col in described.columns: + try: + sample_sql = ( + f"SELECT DISTINCT {col.name} FROM {tbl.qualified} " + f"WHERE {col.name} IS NOT NULL LIMIT {_SAMPLE_LIMIT}" + ) + rows = await ctx.explorer.execute(sample_sql, _SAMPLE_LIMIT) + samples = [str(r.get(col.name, r.get(list(r.keys())[0], ""))) for r in rows] + except Exception: + samples = [] + sample_str = f" 샘플: {samples}" if samples else "" + schema_lines.append(f"- {col.name} ({col.type}){sample_str}") + schema_lines.append("") + + schema_block = "\n".join(schema_lines) + prompt = _build_prompt(display_name, schema_block) + + completion = await ctx.llm.complete([Message(role=Role.USER, content=prompt)]) + domain, terms = _extract_result(completion.content) + + if not terms: + return ToolResult( + call_id="", + content="LLM이 용어를 추출하지 못했습니다. 다시 시도해주세요.", + is_error=True, + ) + + ctx.store.kv_set( + scope, + meta_key, + json.dumps({"name": display_name, "domain": domain, "registered_at": time.time()}, ensure_ascii=False), + ) + + saved_terms: list[str] = [] + for t in terms: + term = str(t.get("term", "")).strip() + definition = str(t.get("definition", "")).strip() + synonyms = t.get("synonyms", []) + if not isinstance(synonyms, list): + synonyms = [s.strip() for s in str(synonyms).split(",") if s.strip()] + if not term or not definition: + continue + if ":" in term: + continue # `:` 포함 term은 KV 키 파싱을 깨트리므로 건너뜀 + entry = FedEntry( + term=term, layer=layer, entity=entity, + definition=definition, synonyms=synonyms, inferred=True, + ) + kv_key = _semfed_kv_key(term, layer, entity) + ctx.store.kv_set(scope, kv_key, entry.to_json()) + syn_str = f" (= {', '.join(synonyms)})" if synonyms else "" + saved_terms.append(f"- **{term}**{syn_str}: {definition} 🤖") + + layer_label = "팀(채널)" if use_team else "전사(guild)" + domain_line = f"📌 도메인: {domain}\n\n" if domain else "" + term_block = "\n".join(saved_terms) + return ToolResult( + call_id="", + content=( + f"✅ **{display_name}** [{layer_label}] 등록 완료 — " + f"테이블 {len(all_tables)}개 스캔, 용어 {len(saved_terms)}개 추출\n\n" + f"{domain_line}" + f"**추출된 용어:**\n{term_block}" + ), + ) diff --git a/src/lang2sql/tools/semantic_federation.py b/src/lang2sql/tools/semantic_federation.py new file mode 100644 index 0000000..b7f53c1 --- /dev/null +++ b/src/lang2sql/tools/semantic_federation.py @@ -0,0 +1,366 @@ +"""SemanticFederation — 채널(팀)/전사(guild)/개인(member) 계층 비즈니스 용어 사전. + +계층 우선순위 (narrow → wide): member > channel > guild +- guild : 전사 공통 정의 (회사 전체, /org_setup이 자동 채움) +- channel: 이 채널/팀 전용 정의 (다른 채널과 충돌 없음 — 채널이 격리 경계) +- member : 개인 오버라이드 (조용히 상위 정의를 덮어씀) + +KV 키 구조 (모두 guild scope에 저장): + cterm:{term_lower}:guild → 전사 공통 + cterm:{term_lower}:channel:{ch_id} → 채널(팀) 전용 + cterm:{term_lower}:member:{user_id} → 개인 +""" + +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +from ..core.ports.audit import AuditEvent + +from ..core.ports.tool import ToolPort, ToolResult, ToolSpec + +if TYPE_CHECKING: + from ..harness.context import HarnessContext + +_KV_PREFIX = "cterm" +_LAYERS = ("guild", "channel", "member") + +from ..tools.enrich_schema import _KV_PREFIX as _ENRICH_PREFIX, _KV_RELATIONSHIPS as _ENRICH_RELATIONSHIPS + +_AMBIGUITY_SIGNALS: dict[str, str] = { + r"(^|_)(created|registered|joined|signup)(_at|_date)?$": "신규/최초 가입 기준 용어", + r"(^|_)(last|latest|recent)_(login|visit|active|seen|access)(_at|_date)?$": "활성화 기준 용어", + r"(^|_)(first|initial)_(order|purchase|buy)(_at|_date)?$": "첫 구매 기준 용어", + r"(^|_)(status|state|type|category|tier|grade|rank|segment)$": "상태/분류 기반 용어", + r"(^|_)(score|point|level|rating)$": "점수/등급 기반 용어", + r"(^|_)(is_|has_|can_).+": "boolean 조건 기반 용어", +} + + +def _kv_key(term: str, layer: str, entity: str) -> str: + base = f"{_KV_PREFIX}:{term.strip().lower()}:{layer}" + if layer == "guild": + return base + return f"{base}:{entity.strip().lower()}" + + +@dataclass +class FedEntry: + term: str + layer: str # guild | channel | member + entity: str # channel_id (channel layer), user_id (member layer), "" (guild layer) + definition: str + synonyms: list[str] = field(default_factory=list) + inferred: bool = False + + def __post_init__(self) -> None: + if not isinstance(self.synonyms, list): + self.synonyms = [s.strip() for s in str(self.synonyms).split(",") if s.strip()] + + def to_json(self) -> str: + return json.dumps( + { + "term": self.term, "layer": self.layer, "entity": self.entity, + "definition": self.definition, "synonyms": self.synonyms, + "inferred": self.inferred, + }, + ensure_ascii=False, + ) + + @staticmethod + def from_json(raw: str) -> "FedEntry": + d = json.loads(raw) + return FedEntry( + term=d["term"], layer=d["layer"], entity=d.get("entity", ""), + definition=d["definition"], synonyms=d.get("synonyms", []), + inferred=d.get("inferred", False), + ) + + +class SemanticFederationTool(ToolPort): + @property + def spec(self) -> ToolSpec: + return ToolSpec( + name="term_custom", + description=( + "비즈니스 용어 사전 관리. " + "layer=guild(전사)/channel(이 채널·팀)/member(개인). " + "lookup은 narrow→wide: member > channel > guild. " + "list=true로 전체 조회. remove=true로 삭제." + ), + parameters={ + "type": "object", + "properties": { + "term": { + "type": "string", + "description": "정식 용어명 (예: 활성고객)", + }, + "definition": { + "type": "string", + "description": "DB 컨텍스트에서의 정의 (예: 최근 30일 로그인한 users)", + }, + "layer": { + "type": "string", + "enum": ["guild", "channel", "member"], + "description": "등록 범위. guild=전사 공통, channel=이 채널(팀), member=개인(기본값)", + }, + "synonyms": { + "type": "string", + "description": "쉼표 구분 동의어 (예: active_user,활성화고객)", + }, + "inferred": { + "type": "boolean", + "description": "true 시 LLM 추론 임시 정의로 표시. 사용자 확인 후 재등록 권장.", + }, + "scan": { + "type": "boolean", + "description": "true 시 enriched schema에서 모호 용어 후보 탐색.", + }, + "remove": { + "type": "boolean", + "description": "true 시 해당 term+layer 삭제", + }, + "list": { + "type": "boolean", + "description": "true 시 현재 채널 기준 유효 용어 목록 반환", + }, + }, + }, + ) + + async def run(self, args: dict[str, Any], ctx: "HarnessContext") -> ToolResult: + if ctx.store is None: + return ToolResult(call_id="", content="❌ KV store 미설정", is_error=True) + + scope = ctx.identity.kv_scope + user_id = ctx.identity.user_id or "unknown" + channel_id = ctx.identity.effective_channel_id + + if args.get("list"): + return ToolResult(call_id="", content=_render_effective(ctx.store, scope, channel_id, user_id)) + + if args.get("scan"): + return ToolResult(call_id="", content=_scan_schema(ctx.store, scope)) + + term = str(args.get("term", "")).strip() + if not term: + return ToolResult(call_id="", content="❌ term 파라미터가 필요합니다.", is_error=True) + if ":" in term: + return ToolResult(call_id="", content="❌ term에 ':'를 사용할 수 없습니다.", is_error=True) + + layer = str(args.get("layer", "member")).strip().lower() + if layer not in _LAYERS: + return ToolResult( + call_id="", + content=f"❌ layer는 {list(_LAYERS)} 중 하나여야 합니다.", + is_error=True, + ) + + entity = "" if layer == "guild" else (user_id if layer == "member" else channel_id) + key = _kv_key(term, layer, entity) + + if args.get("remove"): + # 세 scope 전체를 스캔해서 존재하는 항목 모두 삭제 + deleted_tags: list[str] = [] + for lyr, ent in [("guild", ""), ("channel", channel_id), ("member", user_id)]: + k = _kv_key(term, lyr, ent) + if ctx.store.kv_get(scope, k) is not None: + ctx.store.kv_delete(scope, k) + deleted_tags.append(_layer_tag(lyr, ent, user_id, channel_id)) + if not deleted_tags: + return ToolResult(call_id="", content=f"⚠️ **{term}** — 등록된 정의가 없습니다.") + return ToolResult(call_id="", content=f"🗑️ **{term}** [{', '.join(deleted_tags)}] 삭제") + + definition = str(args.get("definition", "")).strip() + if not definition: + return ToolResult(call_id="", content="❌ definition 파라미터가 필요합니다.", is_error=True) + + raw_syns = str(args.get("synonyms", "")).strip() + synonyms = [s.strip() for s in raw_syns.split(",") if s.strip()] if raw_syns else [] + inferred = bool(args.get("inferred", False)) + + entry = FedEntry(term=term, layer=layer, entity=entity, + definition=definition, synonyms=synonyms, inferred=inferred) + ctx.store.kv_set(scope, key, entry.to_json()) + if ctx.audit is not None: + await ctx.audit.record( + AuditEvent(actor=user_id, action="term_custom", + scope=scope, detail={"term": term, "layer": layer}) + ) + + tag = _layer_tag(layer, entity, user_id, channel_id) + syn_str = f" (= {', '.join(synonyms)})" if synonyms else "" + inferred_badge = " 🤖추론" if inferred else "" + return ToolResult( + call_id="", + content=f"✅ **{term}** [{tag}]{syn_str}{inferred_badge}: {definition}", + ) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _layer_tag(layer: str, entity: str, user_id: str, channel_id: str) -> str: + if layer == "guild": + return "전사" + if layer == "channel": + return f"채널:{channel_id}" + return f"개인:{user_id}" + + +# --------------------------------------------------------------------------- +# Schema scan +# --------------------------------------------------------------------------- + +def _scan_schema(store: Any, scope: str) -> str: + col_entries = store.kv_list_prefix(scope, _ENRICH_PREFIX + ":") + if not col_entries: + return "⚠️ enriched schema가 없습니다. 먼저 `/enrich`를 실행해 스키마를 보강하세요." + + col_map: dict[str, str] = {} + for key, desc in col_entries: + parts = key.split(":", 2) + if len(parts) == 3: + col_map[f"{parts[1]}.{parts[2]}"] = desc + + raw_rels = store.kv_get(scope, _ENRICH_RELATIONSHIPS) + relationships: list[str] = json.loads(raw_rels) if raw_rels else [] + + candidates: dict[str, list[tuple[str, str, str]]] = {} + for col_key, desc in col_map.items(): + table, col = col_key.split(".", 1) + for pattern, signal_type in _AMBIGUITY_SIGNALS.items(): + if re.search(pattern, col, re.IGNORECASE): + candidates.setdefault(signal_type, []).append((table, col, desc)) + break + + if not candidates: + return f"스키마에서 모호 용어를 암시하는 컬럼을 찾지 못했습니다. (스캔한 컬럼 수: {len(col_map)}개)" + + lines = [ + "## Business Terminology — 스키마 스캔 결과\n", + "다음 컬럼들이 모호한 비즈니스 용어를 암시합니다.", + "각 항목에 대해 term_custom 등록 여부를 사용자에게 확인하세요.\n", + ] + for signal_type, cols in candidates.items(): + lines.append(f"### {signal_type}") + for table, col, desc in cols: + desc_str = f" — {desc}" if desc else "" + lines.append(f"- `{table}.{col}`{desc_str}") + lines.append("") + + if relationships: + lines.append("### 테이블 관계 (참고)") + for rel in relationships: + lines.append(f"- {rel}") + lines.append("") + + lines.append( + "---\n위 컬럼을 바탕으로 모호 용어 정의를 추론하고 `term_custom` 툴로 `inferred=true` 등록하거나, " + "사용자에게 어느 범위(guild/channel/member)로 등록할지 확인하세요." + ) + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# System-prompt helpers +# --------------------------------------------------------------------------- + +def _load_all(store: Any, scope: str) -> dict[str, list[FedEntry]]: + """KV에서 모든 cterm 엔트리를 {term_lower: [FedEntry]} 로 반환.""" + raw = store.kv_list_prefix(scope, _KV_PREFIX + ":") + by_term: dict[str, list[FedEntry]] = {} + for key, val in raw: + # cterm:{term}:guild or cterm:{term}:channel:{id} or cterm:{term}:member:{id} + parts = key.split(":", 3) + if len(parts) < 3: + continue + layer = parts[2] + if layer not in _LAYERS: + continue + try: + entry = FedEntry.from_json(val) + except (ValueError, KeyError): + continue + by_term.setdefault(parts[1], []).append(entry) + return by_term + + +def build_prompt_section(store: Any, scope: str, channel_id: str, user_id: str) -> str: + """현재 채널 기준 narrow→wide lookup 용어 섹션 + 모호 용어 지침 반환.""" + by_term = _load_all(store, scope) + + if not by_term: + return _AMBIGUOUS_TERM_POLICY + + lines: list[str] = [] + for term_lower in sorted(by_term): + line = _resolve_term(by_term[term_lower], channel_id, user_id) + if line: + lines.append(line) + + header = ( + "## Business Terminology\n" + "(lookup 우선순위: 개인 > 채널(팀) > 전사)\n" + ) + body = "\n".join(lines) if lines else "(없음)" + return header + body + "\n\n" + _AMBIGUOUS_TERM_POLICY + + +_AMBIGUOUS_TERM_POLICY = """\ +## Ambiguous Term Policy +사전에 없는 주관적/모호한 표현(예: 활성화고객, 신규고객, 우량고객)을 발견하면: +1. 현재 DB 스키마 컨텍스트에서 가장 합리적인 해석으로 SQL을 작성하고 실행한다. +2. 쿼리 후 사용한 해석을 명시하고, term_custom 등록 여부와 범위(guild/channel/member)를 사용자에게 묻는다. + 예: "'신규고객'을 'users.created_at >= NOW()-30일'로 해석했습니다. 이 정의를 어느 범위로 등록할까요?" +3. 사용자가 범위를 지정하면 term_custom 툴로 즉시 등록한다 (inferred=true). +4. inferred=true 엔트리가 이미 있으면 해당 정의를 우선 사용하되, 사용자에게 확정 여부를 확인한다.\ +""" + + +def _fmt_entry(e: FedEntry, tag: str) -> str: + syns = ", ".join(e.synonyms) + syn_str = f" (= {syns})" if syns else "" + inferred_badge = " 🤖" if e.inferred else "" + return f"- **{e.term}** [{tag}]{syn_str}{inferred_badge}: {e.definition}" + + +def _resolve_term(entries: list[FedEntry], channel_id: str, user_id: str) -> str: + """narrow→wide lookup: member > channel > guild.""" + # 1. 개인 오버라이드 + for e in entries: + if e.layer == "member" and e.entity.lower() == user_id.lower(): + return _fmt_entry(e, f"개인:{user_id}") + + # 2. 이 채널 정의 + for e in entries: + if e.layer == "channel" and e.entity.lower() == channel_id.lower(): + return _fmt_entry(e, "채널") + + # 3. 전사 공통 + for e in entries: + if e.layer == "guild": + return _fmt_entry(e, "전사") + + return "" + + +def _render_effective(store: Any, scope: str, channel_id: str, user_id: str) -> str: + """Discord /term_custom list 응답 — 현재 채널 기준 유효 용어 목록.""" + by_term = _load_all(store, scope) + if not by_term: + return "등록된 용어가 없습니다.\n`/term_custom`으로 용어를 추가하세요." + + lines = ["**Business Terminology — 현재 채널 기준 유효 정의**\n"] + for term_lower in sorted(by_term): + line = _resolve_term(by_term[term_lower], channel_id, user_id) + if line: + lines.append(line) + + if len(lines) == 1: + lines.append("(이 채널에 적용되는 용어 정의가 없습니다)") + return "\n".join(lines) diff --git a/tests/test_bench_demo.py b/tests/test_bench_demo.py index 9a2e625..0af6c9e 100644 --- a/tests/test_bench_demo.py +++ b/tests/test_bench_demo.py @@ -1,8 +1,9 @@ """Smoke test: the bench demo runs clean and shows its headline behaviours. Guards the study-group demo against drift in the modules it exercises -(ContextConcierge, ScopeResolver, SafetyPipeline). Runs the demo's ``main()`` -in-process and asserts the federation + safety claims it prints are real. +(ContextConcierge, KV-backed federation, SafetyPipeline). Runs the demo's +``main()`` in-process and asserts the federation + safety claims it prints +are real. """ from __future__ import annotations @@ -11,8 +12,6 @@ import importlib.util from pathlib import Path -import pytest - _DEMO = Path(__file__).resolve().parent.parent / "bench" / "ecommerce_demo.py" @@ -40,22 +39,22 @@ def test_demo_runs_clean(capsys): def test_demo_federation_resolves_distinct_definitions(): - """Reach into the demo's building blocks directly (no printing).""" + """Reach into the demo's KV building blocks directly (no printing).""" demo = _load_demo() - from lang2sql.semantic.types import Metric - from lang2sql.tenancy.scope_resolver import ScopeResolver - - async def _run(): - resolver = ScopeResolver() - mkt = demo._marketing_identity() - fin = demo._finance_identity() - await resolver.define(mkt.default_write_scope(), Metric("active_user", "30d login")) - await resolver.define(fin.default_write_scope(), Metric("active_user", "paid sub")) - mkt_layer = await resolver.effective_layer(mkt) - fin_layer = await resolver.effective_layer(fin) - return mkt_layer.lookup("active_user").definition, fin_layer.lookup("active_user").definition - - mkt_def, fin_def = asyncio.run(_run()) - assert mkt_def == "30d login" - assert fin_def == "paid sub" - assert mkt_def != fin_def + from lang2sql.adapters.storage.sqlite_store import SqliteStore + from lang2sql.tools.semantic_federation import _render_effective + + store = SqliteStore() + mkt = demo._marketing_identity() + fin = demo._finance_identity() + + demo._define_term(store, demo.GUILD, "active_user", "channel", demo.CH_MARKETING, "30d login") + demo._define_term(store, demo.GUILD, "active_user", "channel", demo.CH_FINANCE, "paid sub") + + mkt_rendered = _render_effective(store, demo.GUILD, demo.CH_MARKETING, mkt.user_id) + fin_rendered = _render_effective(store, demo.GUILD, demo.CH_FINANCE, fin.user_id) + + assert "30d login" in mkt_rendered + assert "paid sub" not in mkt_rendered + assert "paid sub" in fin_rendered + assert "30d login" not in fin_rendered diff --git a/tests/test_discord.py b/tests/test_discord.py index 0fe8fea..2ac606c 100644 --- a/tests/test_discord.py +++ b/tests/test_discord.py @@ -105,15 +105,15 @@ def test_render_many_text_lines_attaches() -> None: # -- CommandHandlers (real in-memory concierge) --------------------------- -def test_define_metric_then_semantic_show() -> None: +def test_term_custom_then_list() -> None: handlers = CommandHandlers(ContextConcierge()) ident = to_identity( InteractionContext(user_id="u1", guild_id="g1", channel_id="c1") ) async def scenario() -> tuple[str, str]: - defined = await handlers.define_metric(ident, "active_user", "logged in within 30 days") - shown = await handlers.semantic_show(ident) + defined = await handlers.term_custom(ident, term="active_user", definition="logged in within 30 days", layer="channel") + shown = await handlers.term_custom(ident, list_all=True) return defined.text, shown.text defined_text, shown_text = asyncio.run(scenario()) @@ -122,22 +122,22 @@ async def scenario() -> tuple[str, str]: assert "logged in within 30 days" in shown_text -def test_semantic_show_empty_scope() -> None: +def test_term_custom_list_empty_scope() -> None: handlers = CommandHandlers(ContextConcierge()) ident = to_identity(InteractionContext(user_id="solo", guild_id="g9", channel_id="c9")) - shown = asyncio.run(handlers.semantic_show(ident)) - assert "No definitions" in shown.text + shown = asyncio.run(handlers.term_custom(ident, list_all=True)) + assert shown.text # empty scope returns some message -def test_define_metric_is_scope_isolated() -> None: +def test_term_custom_is_scope_isolated() -> None: """A channel definition must not leak into a different channel (federation).""" handlers = CommandHandlers(ContextConcierge()) marketing = to_identity(InteractionContext(user_id="u1", guild_id="g1", channel_id="mkt")) product = to_identity(InteractionContext(user_id="u1", guild_id="g1", channel_id="prd")) async def scenario() -> str: - await handlers.define_metric(marketing, "active_user", "30d login") - return (await handlers.semantic_show(product)).text + await handlers.term_custom(marketing, term="active_user", definition="30d login", layer="channel") + return (await handlers.term_custom(product, list_all=True)).text assert "active_user" not in asyncio.run(scenario()) diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py new file mode 100644 index 0000000..1d00ce1 --- /dev/null +++ b/tests/test_edge_cases.py @@ -0,0 +1,82 @@ +"""Edge-case tests for previously uncovered code paths. + +- org_setup synonyms: LLM가 리스트 대신 문자열로 반환해도 올바르게 처리 +- bot.py on_message: file=None일 때 channel.send에 file 인자를 전달하지 않음 +""" + +from __future__ import annotations + +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +from lang2sql.core.identity import Identity +from lang2sql.frontends.discord import InteractionContext, to_identity +from lang2sql.frontends.discord.render import OutboundMessage +from lang2sql.tenancy.concierge import ContextConcierge +from lang2sql.tools.semantic_federation import FedEntry, _render_effective + + +# -- org_setup synonyms ------------------------------------------------------- + + +def test_fed_entry_synonyms_as_string_does_not_corrupt_render() -> None: + """FedEntry with synonyms stored as string (LLM slip) must not explode render.""" + from lang2sql.adapters.storage.sqlite_store import SqliteStore + from lang2sql.tools.semantic_federation import _kv_key + + store = SqliteStore() + scope = "g1" + # Simulate org_setup writing a FedEntry where synonyms is a plain string + # (after our fix this should never happen, but ensure rendering is safe either way) + entry_bad = FedEntry(term="active_user", layer="guild", entity="", + definition="30d login", synonyms="활성유저, active") # type: ignore[arg-type] + store.kv_set(scope, _kv_key("active_user", "guild", ""), entry_bad.to_json()) + + # render must not raise and must not output individual characters + rendered = _render_effective(store, scope, "", "u1") + assert "active_user" in rendered + # "활" appearing as a lone character followed by "," would indicate character-join + assert "활, 성" not in rendered + + +def test_org_setup_synonyms_string_normalised() -> None: + """org_setup coerces a string synonyms value to a list before storing.""" + from lang2sql.tools.semantic_federation import _kv_key, FedEntry + from lang2sql.adapters.storage.sqlite_store import SqliteStore + + store = SqliteStore() + scope = "g-test" + + # Apply the same normalisation logic org_setup uses + raw = "활성유저, active user" + synonyms = raw if isinstance(raw, list) else [s.strip() for s in str(raw).split(",") if s.strip()] + entry = FedEntry(term="active_user", layer="guild", entity="", + definition="30d login", synonyms=synonyms, inferred=True) + store.kv_set(scope, _kv_key("active_user", "guild", ""), entry.to_json()) + + rendered = _render_effective(store, scope, "", "u1") + assert "활성유저" in rendered + assert "active user" in rendered + + +# -- bot.py on_message file guard --------------------------------------------- + + +def test_to_sendable_text_only_returns_none_file() -> None: + """Text-only response produces file=None; on_message must not forward that to channel.send.""" + from lang2sql.frontends.discord.bot import _to_sendable + + msg = OutboundMessage(text="42 users", file_bytes=None) + content, file = _to_sendable(msg) + assert content == "42 users" + assert file is None # confirms the guard in on_message is needed / correct + + +def test_on_message_send_kwargs_omit_file_when_none() -> None: + """Verify the kwargs-building pattern omits file when _to_sendable returns None.""" + # Simulate the exact on_message send block + file = None + kwargs: dict = {"content": "42 users"} + if file is not None: + kwargs["file"] = file + assert "file" not in kwargs # file=None must not be forwarded to channel.send diff --git a/tests/test_integration.py b/tests/test_integration.py index 413f2ff..9b58d6c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -13,8 +13,8 @@ from lang2sql.core.identity import Identity from lang2sql.core.ports.safety import SafetyContext, Verdict from lang2sql.tenancy.concierge import ContextConcierge -from lang2sql.tools.define_metric import DefineMetric from lang2sql.tools.run_sql import RunSQL +from lang2sql.tools.semantic_federation import SemanticFederationTool def _ctx(): @@ -26,7 +26,7 @@ def _ctx(): def test_v1_tools_registered(): _, ctx = _ctx() names = {s.name for s in ctx.tools.specs()} - assert names == {"run_sql", "explore_schema", "enrich_schema", "define_metric", "ask_user", "remember", "ingest_doc"} + assert names == {"run_sql", "explore_schema", "enrich_schema", "term_custom", "org_setup", "ask_user", "remember", "ingest_doc"} def test_run_sql_passes_gate_and_returns_rows(): @@ -48,15 +48,26 @@ def test_run_sql_tolerates_bad_limit(): assert not res.is_error # malformed limit must not crash the tool -def test_define_metric_is_scope_local(): +def test_term_custom_is_scope_local(): + from lang2sql.tools.semantic_federation import _render_effective ident, ctx = _ctx() - asyncio.run(DefineMetric().run({"name": "active_user", "definition": "30d login"}, ctx)) - layer = asyncio.run(ctx.scope_resolver.effective_layer(ident)) - assert layer.lookup("active_user") is not None - # a different channel does not see it - other = Identity(user_id="u1", guild_id="g1", channel_id="c-fin") - other_layer = asyncio.run(ctx.scope_resolver.effective_layer(other)) - assert other_layer.lookup("active_user") is None + asyncio.run(SemanticFederationTool().run( + {"term": "active_user", "definition": "30d login", "layer": "channel"}, ctx + )) + rendered = _render_effective(ctx.store, ident.kv_scope, ident.effective_channel_id, ident.user_id) + assert "active_user" in rendered + # a different channel does not see this channel-level definition + other_rendered = _render_effective(ctx.store, ident.kv_scope, "c-fin", "u1") + assert "active_user" not in other_rendered + + +def test_term_custom_emits_audit_event(): + ident, ctx = _ctx() + asyncio.run(SemanticFederationTool().run( + {"term": "revenue", "definition": "gross revenue", "layer": "guild"}, ctx + )) + events = asyncio.run(ctx.audit.query(ident.user_id)) + assert any(e.action == "term_custom" and e.detail.get("term") == "revenue" for e in events) def test_safety_pipeline_on_context(): diff --git a/tests/test_persistence.py b/tests/test_persistence.py index ea8dc79..2b1f559 100644 --- a/tests/test_persistence.py +++ b/tests/test_persistence.py @@ -1,6 +1,6 @@ -"""Persistence tests — SqliteSemanticStore durability + Fernet secrets (★④ §4.1). +"""Persistence tests — KV federation durability + Fernet secrets (★④ §4.1). -Two guarantees under test: (1) semantic definitions and secrets survive a fresh +Two guarantees under test: (1) semantic definitions stored in KV survive a fresh store instance pointed at the same sqlite file, and (2) secrets are stored as real Fernet ciphertext, not recoverable without the key. """ @@ -13,76 +13,45 @@ import pytest from cryptography.fernet import Fernet, InvalidToken -from lang2sql.adapters.storage.sqlite_semantic import SqliteSemanticStore from lang2sql.adapters.storage.sqlite_store import SqliteStore -from lang2sql.core.identity import Identity, Scope, ScopeLevel -from lang2sql.semantic.types import Metric, SemanticKind +from lang2sql.core.identity import Scope, ScopeLevel from lang2sql.tenancy.encrypted_secrets import EncryptedSecrets -from lang2sql.tenancy.scope_resolver import ScopeResolver +from lang2sql.tools.semantic_federation import FedEntry, _kv_key, _render_effective -def test_semantic_store_add_entries_at_round_trip() -> None: - store = SqliteSemanticStore() - scope = Scope(ScopeLevel.CHANNEL, "c1") - store.add(scope, Metric("revenue", "sum of order totals", created_by="u1")) +def test_kv_federation_survives_new_instance(tmp_path) -> None: + db = str(tmp_path / "kv.db") + scope = "g1" - entries = store.entries_at(scope) - assert len(entries) == 1 - assert entries[0].name == "revenue" - assert entries[0].kind is SemanticKind.METRIC - assert entries[0].definition == "sum of order totals" - assert entries[0].created_by == "u1" - # created_at is preserved, not regenerated. - assert entries[0].created_at != "" - - # Re-adding the same name at the same scope replaces it. - store.add(scope, Metric("revenue", "net revenue")) - entries = store.entries_at(scope) - assert len(entries) == 1 - assert entries[0].definition == "net revenue" - - # A different scope is isolated. - assert store.entries_at(Scope(ScopeLevel.CHANNEL, "other")) == [] - - -def test_semantic_store_survives_new_instance_on_same_file(tmp_path) -> None: - db = str(tmp_path / "semantic.db") - scope = Scope(ScopeLevel.GUILD, "g1") - - writer = SqliteSemanticStore(db) - writer.add(scope, Metric("aov", "avg order value", source_id="doc-7")) - created_at = writer.entries_at(scope)[0].created_at + writer = SqliteStore(db) + entry = FedEntry(term="revenue", layer="guild", entity="", definition="sum of order totals") + writer.kv_set(scope, _kv_key("revenue", "guild", ""), entry.to_json()) writer.close() - # A fresh instance on the same path sees the persisted definition. - reader = SqliteSemanticStore(db) - entries = reader.entries_at(scope) - assert len(entries) == 1 - assert entries[0].name == "aov" - assert entries[0].definition == "avg order value" - assert entries[0].source_id == "doc-7" - assert entries[0].created_at == created_at + reader = SqliteStore(db) + rendered = _render_effective(reader, scope, "c1", "u1") + assert "revenue" in rendered + assert "sum of order totals" in rendered reader.close() -def test_scope_resolver_effective_layer_over_sqlite_store() -> None: - store = SqliteSemanticStore() - # Narrow (channel) overrides wide (guild) for the same name. - store.add(Scope(ScopeLevel.GUILD, "g1"), Metric("revenue", "guild-wide def")) - store.add(Scope(ScopeLevel.CHANNEL, "c1"), Metric("revenue", "channel def")) - store.add(Scope(ScopeLevel.GUILD, "g1"), Metric("churn", "guild churn")) +def test_kv_channel_overrides_guild_persisted(tmp_path) -> None: + db = str(tmp_path / "kv.db") + scope = "g1" - resolver = ScopeResolver(store) - identity = Identity(user_id="u1", guild_id="g1", channel_id="c1") - layer = asyncio.run(resolver.effective_layer(identity)) + store = SqliteStore(db) + store.kv_set(scope, _kv_key("active_user", "guild", ""), FedEntry("active_user", "guild", "", "guild def").to_json()) + store.kv_set(scope, _kv_key("active_user", "channel", "c1"), FedEntry("active_user", "channel", "c1", "channel def").to_json()) + store.close() - by_name = {e.name: e.definition for e in layer.entries} - assert by_name["revenue"] == "channel def" # most specific wins - assert by_name["churn"] == "guild churn" # inherited from guild + reader = SqliteStore(db) + rendered = _render_effective(reader, scope, "c1", "u1") + assert "channel def" in rendered + assert "guild def" not in rendered + reader.close() def test_encrypted_secrets_round_trip_and_ciphertext(tmp_path) -> None: - # Explicit key so the test is independent of env / generated state. key = Fernet.generate_key() store = SqliteStore() secrets = EncryptedSecrets(store, key=key) @@ -91,12 +60,10 @@ def test_encrypted_secrets_round_trip_and_ciphertext(tmp_path) -> None: asyncio.run(secrets.set("guild:1", "dsn", "postgresql://u:p@host/db")) assert asyncio.run(secrets.get("guild:1", "dsn")) == "postgresql://u:p@host/db" - # What actually lands in kv is a Fernet token, never the plaintext. blob = store.kv_get("guild:1", "dsn") assert blob is not None assert "postgresql" not in blob assert blob != "postgresql://u:p@host/db" - # It decrypts only with the right key. assert Fernet(key).decrypt(blob.encode("ascii")).decode() == "postgresql://u:p@host/db" asyncio.run(secrets.delete("guild:1", "dsn")) @@ -107,7 +74,6 @@ def test_encrypted_secrets_wrong_key_fails() -> None: store = SqliteStore() asyncio.run(EncryptedSecrets(store, key=Fernet.generate_key()).set("s", "k", "v")) - # A different key cannot decrypt the stored token. attacker = EncryptedSecrets(store, key=Fernet.generate_key()) with pytest.raises(InvalidToken): asyncio.run(attacker.get("s", "k")) @@ -115,14 +81,12 @@ def test_encrypted_secrets_wrong_key_fails() -> None: def test_encrypted_secrets_survive_new_instance_via_persisted_key(tmp_path) -> None: db = str(tmp_path / "secrets.db") - # No env key -> a key is generated and persisted in the kv table. saved = os.environ.pop("LANG2SQL_SECRET_KEY", None) try: store = SqliteStore(db) asyncio.run(EncryptedSecrets(store).set("guild:1", "dsn", "postgresql://x")) store.close() - # A fresh store + secrets on the same file reuses the persisted key. reopened = SqliteStore(db) secrets = EncryptedSecrets(reopened) assert asyncio.run(secrets.get("guild:1", "dsn")) == "postgresql://x" @@ -130,27 +94,3 @@ def test_encrypted_secrets_survive_new_instance_via_persisted_key(tmp_path) -> N finally: if saved is not None: os.environ["LANG2SQL_SECRET_KEY"] = saved - - -def test_concierge_path_persists_definitions_and_secrets(tmp_path) -> None: - from lang2sql.adapters.llm.fake import FakeLLM - from lang2sql.tenancy.concierge import ContextConcierge - - db = str(tmp_path / "concierge.db") - saved = os.environ.pop("LANG2SQL_SECRET_KEY", None) - try: - c1 = ContextConcierge(path=db, llm=FakeLLM()) - scope = Scope(ScopeLevel.CHANNEL, "c1") - asyncio.run(c1.scope_resolver.define(scope, Metric("revenue", "sum totals"))) - asyncio.run(c1.secrets.set("guild:1", "dsn", "postgresql://secret")) - c1.store.close() - c1.scope_resolver.store.close() - - c2 = ContextConcierge(path=db, llm=FakeLLM()) - assert [e.name for e in asyncio.run(c2.scope_resolver.entries_at(scope))] == [ - "revenue" - ] - assert asyncio.run(c2.secrets.get("guild:1", "dsn")) == "postgresql://secret" - finally: - if saved is not None: - os.environ["LANG2SQL_SECRET_KEY"] = saved diff --git a/tests/test_semantic.py b/tests/test_semantic.py index acf0f0c..f24c127 100644 --- a/tests/test_semantic.py +++ b/tests/test_semantic.py @@ -1,115 +1,86 @@ -"""Tests for the semantic federation layer (★④). +"""Tests for KV-backed semantic federation (★④). -Plain functions named ``test_*`` — runnable without pytest via the validation -one-liner in the spawn brief. +Verifies that the KV store correctly implements narrow→wide scope resolution, +matching the original ScopeResolver contract. """ from __future__ import annotations -import asyncio - -from lang2sql.core.identity import Identity, Scope, ScopeLevel -from lang2sql.semantic import ( - Metric, - SemanticEntry, - SemanticKind, - SemanticLayer, - SemanticStore, - merge_scoped, +from lang2sql.adapters.storage.sqlite_store import SqliteStore +from lang2sql.tools.semantic_federation import ( + FedEntry, + _kv_key, + _render_effective, + build_prompt_section, ) -from lang2sql.tenancy.scope_resolver import ScopeResolver - - -def test_render_empty_is_blank() -> None: - assert SemanticLayer().render() == "" - - -def test_render_lists_entries() -> None: - layer = SemanticLayer( - [ - SemanticEntry(SemanticKind.METRIC, "active_user", "30d login"), - SemanticEntry( - SemanticKind.DIMENSION, "region", "billing country", applies_to="users" - ), - ] - ) - rendered = layer.render() - assert "**active_user** [metric]: 30d login" in rendered - assert "**region** [dimension]: billing country (applies to users)" in rendered - # one bullet line per entry - assert len(rendered.splitlines()) == 2 - assert all(line.startswith("- ") for line in rendered.splitlines()) - - -def test_layer_add_replaces_same_name() -> None: - layer = SemanticLayer([SemanticEntry(SemanticKind.METRIC, "rev", "gross")]) - layer.add(SemanticEntry(SemanticKind.METRIC, "rev", "net")) - assert len(layer.entries) == 1 - assert layer.lookup("rev").definition == "net" - - -def test_scoped_merge_narrow_wins() -> None: - """active_user defined at channel must override the guild definition.""" - guild = Scope(ScopeLevel.GUILD, "g1") - channel = Scope(ScopeLevel.CHANNEL, "c1") - merged = merge_scoped( - [ - (channel, [Metric(name="active_user", definition="7d core action")]), - (guild, [Metric(name="active_user", definition="30d login")]), - ] - ) - assert merged.lookup("active_user").definition == "7d core action" - - -def test_scoped_merge_wider_fills_gaps() -> None: - guild = Scope(ScopeLevel.GUILD, "g1") - channel = Scope(ScopeLevel.CHANNEL, "c1") - merged = merge_scoped( - [ - (channel, [Metric(name="active_user", definition="7d")]), - (guild, [Metric(name="revenue", definition="net")]), - ] - ) - assert merged.lookup("active_user").definition == "7d" - assert merged.lookup("revenue").definition == "net" - - -def test_resolver_effective_layer_guild_channel() -> None: - store = SemanticStore() - resolver = ScopeResolver(store) - identity = Identity(user_id="u1", guild_id="g1", channel_id="c1") - - async def scenario() -> SemanticLayer: - await resolver.define( - Scope(ScopeLevel.GUILD, "g1"), - Metric(name="active_user", definition="30d login"), - ) - await resolver.define( - Scope(ScopeLevel.GUILD, "g1"), - Metric(name="revenue", definition="gross"), - ) - # channel override of active_user only - await resolver.define( - Scope(ScopeLevel.CHANNEL, "c1"), - Metric(name="active_user", definition="7d core action"), - ) - return await resolver.effective_layer(identity) - - layer = asyncio.run(scenario()) - assert layer.lookup("active_user").definition == "7d core action" # channel wins - assert layer.lookup("revenue").definition == "gross" # inherited from guild - - -def test_resolver_entries_at_no_inheritance() -> None: - store = SemanticStore() - resolver = ScopeResolver(store) - - async def scenario() -> list[SemanticEntry]: - await resolver.define( - Scope(ScopeLevel.GUILD, "g1"), - Metric(name="revenue", definition="gross"), - ) - # channel has nothing of its own - return await resolver.entries_at(Scope(ScopeLevel.CHANNEL, "c1")) - - assert asyncio.run(scenario()) == [] + + +def _store_with_entries(entries: list[tuple[str, str, str, str]]) -> SqliteStore: + """Helper: create an in-memory store and populate KV with FedEntry items. + + Each tuple is (scope, term, layer, entity, definition) — scope is the guild id. + """ + store = SqliteStore() + for scope, term, layer, entity, definition in entries: # type: ignore[misc] + entry = FedEntry(term=term, layer=layer, entity=entity, definition=definition) + store.kv_set(scope, _kv_key(term, layer, entity), entry.to_json()) + return store + + +def test_channel_overrides_guild() -> None: + store = SqliteStore() + scope = "g1" + store.kv_set(scope, _kv_key("active_user", "guild", ""), FedEntry("active_user", "guild", "", "30d login").to_json()) + store.kv_set(scope, _kv_key("active_user", "channel", "c1"), FedEntry("active_user", "channel", "c1", "7d core action").to_json()) + + rendered = _render_effective(store, scope, "c1", "u1") + assert "7d core action" in rendered + assert "30d login" not in rendered + + +def test_guild_fills_gap_when_channel_missing() -> None: + store = SqliteStore() + scope = "g1" + store.kv_set(scope, _kv_key("revenue", "guild", ""), FedEntry("revenue", "guild", "", "net revenue").to_json()) + + rendered = _render_effective(store, scope, "c1", "u1") + assert "net revenue" in rendered + + +def test_member_overrides_channel_and_guild() -> None: + store = SqliteStore() + scope = "g1" + store.kv_set(scope, _kv_key("active_user", "guild", ""), FedEntry("active_user", "guild", "", "guild def").to_json()) + store.kv_set(scope, _kv_key("active_user", "channel", "c1"), FedEntry("active_user", "channel", "c1", "channel def").to_json()) + store.kv_set(scope, _kv_key("active_user", "member", "u1"), FedEntry("active_user", "member", "u1", "member def").to_json()) + + rendered = _render_effective(store, scope, "c1", "u1") + assert "member def" in rendered + assert "channel def" not in rendered + assert "guild def" not in rendered + + +def test_two_channels_isolated() -> None: + store = SqliteStore() + scope = "g1" + store.kv_set(scope, _kv_key("active_user", "channel", "mkt"), FedEntry("active_user", "channel", "mkt", "30d login").to_json()) + store.kv_set(scope, _kv_key("active_user", "channel", "fin"), FedEntry("active_user", "channel", "fin", "paid subscriber").to_json()) + + mkt = _render_effective(store, scope, "mkt", "u1") + fin = _render_effective(store, scope, "fin", "u2") + assert "30d login" in mkt + assert "paid subscriber" not in mkt + assert "paid subscriber" in fin + assert "30d login" not in fin + + +def test_empty_store_returns_no_terms() -> None: + store = SqliteStore() + rendered = _render_effective(store, "g1", "c1", "u1") + assert "등록된 용어가 없습니다" in rendered + + +def test_build_prompt_section_includes_ambiguous_term_policy() -> None: + store = SqliteStore() + section = build_prompt_section(store, "g1", "c1", "u1") + assert "Ambiguous Term Policy" in section diff --git a/tests/test_tenancy.py b/tests/test_tenancy.py index be148ac..a6afaa2 100644 --- a/tests/test_tenancy.py +++ b/tests/test_tenancy.py @@ -49,7 +49,7 @@ def test_build_context_populates_llm_and_session() -> None: assert ctx.session.identity == identity assert ctx.explorer is not None assert ctx.safety is not None - assert ctx.scope_resolver is not None + assert ctx.store is not None assert ctx.audit is not None finally: if saved is not None: