Skip to content

Commit d09b9e3

Browse files
committed
TF search first
1 parent aa97374 commit d09b9e3

3 files changed

Lines changed: 85 additions & 71 deletions

File tree

src/adapters/text_fabric_adapter.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,25 @@ def find_polytonic_surfaces(self, stripped_surface: str) -> set:
841841
self.build_stripped_index()
842842

843843
return self._surface_index.get(stripped_surface, set())
844+
845+
def find_lemma_by_stripped_surface(self, stripped_word: str) -> Optional[str]:
846+
"""
847+
Find lemma directly by accent-insensitive surface match.
848+
This is the fast path for TF-first hybrid lemmatization.
849+
Returns the first lemma found (sorted for stability), or None.
850+
"""
851+
if not hasattr(self, '_stripped_index') or not self._stripped_index:
852+
self.build_stripped_index()
853+
854+
# _stripped_index maps stripped_form -> set(lemmas)
855+
# This includes both stripped variants of surface forms AND stripped lemmas themselves
856+
lemmas = self._stripped_index.get(stripped_word)
857+
858+
if lemmas:
859+
# Return first sorted lemma for stability
860+
return sorted(list(lemmas))[0]
861+
862+
return None
844863

845864
def get_chapter(self, book_code: str, chapter: int, version: str) -> List[Verse]:
846865
version = version.upper()

src/application/workers/find_worker.py

Lines changed: 65 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -73,100 +73,95 @@ def load_alignment_map():
7373

7474
def smart_lemmatize(word: str, adapter) -> str:
7575
"""
76-
Enhanced lemmatization strategy:
77-
1. Polytonic Restoration -> OdyCy
78-
2. Alignment Check (OdyCy Lemma -> N1904 Lemma)
76+
TF-First Hybrid Lemmatization Strategy:
77+
1. TF Direct (accent-insensitive) → Fast path for known NT words
78+
2. OdyCy + Alignment → Fallback for unknown forms
79+
3. Polytonic Restoration → Edge cases
7980
"""
81+
import logging
82+
import time
83+
84+
start = time.time()
85+
86+
# Load alignment map once
8087
alignment = load_alignment_map()
8188

82-
# helper to check alignment
8389
def check_alignment(lemma):
90+
"""Helper to check alignment corrections"""
8491
if alignment and lemma in alignment:
8592
return alignment[lemma]
8693
return lemma
87-
88-
# 1. OdyCy Initial Attempt
89-
odycy_lemma = lemmatize(word)
9094

91-
# Check alignment immediately?
92-
# e.g. input="εἰλημμένος" -> odycy="εἰλημμένος" -> aligned="λαμβάνω"
93-
aligned_lemma = check_alignment(odycy_lemma)
94-
if aligned_lemma != odycy_lemma:
95-
return aligned_lemma
96-
97-
# If OdyCy returned unknown or unchanged word, try restoration
98-
99-
100-
# 2. Check if this lemma exists in TF (exact match)
101-
# If the input was perfect Polytonic, OdyCy likely got it right.
102-
# We trust OdyCy if its output is a valid lemma in TF.
103-
# But wait, purely relying on "is it in TF" might be wrong if OdyCy hallucinated a real word that is WRONG for this context?
104-
# But here we have no context (single word).
105-
# So "Is it a valid lemma?" is the best check we have.
95+
# Ensure indices are built
96+
adapter.build_stripped_index()
97+
stripped_input = GreekNormalizer.strip_accents(word)
10698

107-
# Fallback / Validation Logic
108-
# We want to use OdyCy to Validate the "Restored" Polytonic form if the original input failed.
99+
# ==========================================
100+
# LAYER 1: TF DIRECT LOOKUP (Fast Path)
101+
# ==========================================
102+
tf_lemma = adapter.find_lemma_by_stripped_surface(stripped_input)
103+
if tf_lemma:
104+
elapsed = (time.time() - start) * 1000
105+
logging.debug(f"Lemmatized '{word}' → '{tf_lemma}' in {elapsed:.1f}ms (layer: TF-direct)")
106+
return tf_lemma
109107

110-
# Heuristic: If OdyCy returns the word itself (unchanged) and that word is not a known lemma, it's likely a failure.
111-
# Or if input was monotonic/unaccented, OdyCy output is trustworthy ONLY if it's a valid TF key.
112-
113-
adapter.build_stripped_index()
108+
# ==========================================
109+
# LAYER 2: ODYCY + ALIGNMENT (Fallback)
110+
# ==========================================
111+
odycy_lemma = lemmatize(word)
112+
aligned_lemma = check_alignment(odycy_lemma)
114113

115-
# Manual Overrides logic moved to TextFabricAdapter.build_stripped_index
116-
# The adapter now loads data/lexicons/supplemental_lemmas.json automatically
117-
stripped_input = GreekNormalizer.strip_accents(word)
114+
if aligned_lemma != odycy_lemma:
115+
# Alignment corrected it
116+
elapsed = (time.time() - start) * 1000
117+
logging.debug(f"Lemmatized '{word}' → '{aligned_lemma}' in {elapsed:.1f}ms (layer: OdyCy+alignment)")
118+
return aligned_lemma
118119

119-
# Step A: Polytonic Restoration (Exact)
120+
# ==========================================
121+
# LAYER 3: POLYTONIC RESTORATION (Edge Cases)
122+
# ==========================================
123+
# Try to restore polytonic forms and feed to OdyCy
120124
candidates = adapter.find_polytonic_surfaces(stripped_input)
121125

122-
# Step A2: Polytonic Restoration (Fuzzy)
126+
# Fuzzy matching variations
123127
if not candidates:
124128
# Try movable nu
125129
if not stripped_input.endswith("ν"):
126-
candidates = adapter.find_polytonic_surfaces(stripped_input + "ν")
127-
128-
# Try modern/ancient ending swap (-αν -> -ον)
130+
candidates = adapter.find_polytonic_surfaces(stripped_input + "ν")
131+
132+
# Try modern/ancient ending swap (-αν -> -ον)
129133
if not candidates and stripped_input.endswith("αν"):
130-
# e.g. elaban -> elabon
131-
variation = stripped_input[:-2] + "ον"
132-
candidates = adapter.find_polytonic_surfaces(variation)
133-
134+
variation = stripped_input[:-2] + "ον"
135+
candidates = adapter.find_polytonic_surfaces(variation)
136+
134137
if candidates:
135-
# We found potential Polytonic forms!
136-
# Pick the best candidate (alphabetically first for stability)
138+
# Pick best candidate (alphabetically first for stability)
137139
best_candidate = sorted(list(candidates))[0]
138140

139141
# Feed RESTORED form to OdyCy
140142
restored_lemma = lemmatize(best_candidate)
141143

142-
# Check if the result is valid
143-
stripped_chk = GreekNormalizer.strip_accents(restored_lemma)
144-
if stripped_chk in adapter._stripped_index:
145-
return restored_lemma
146-
147-
# Feature: Recursive Check via Alignment (Already covered above? Or double check?)
148-
# Restored lemma might be different from initial lemma.
149-
# e.g. restored="εἰλημμένος", map says "λαμβάνω".
144+
# Check alignment for restored lemma
150145
aligned_restored = check_alignment(restored_lemma)
151146
if aligned_restored != restored_lemma:
152-
return aligned_restored
153-
154-
# Final Fallback: Just return a TF match directly if exists
155-
156-
# Final Fallback: Just return a TF match directly if exists
157-
tf_candidates = adapter.find_lemmas_by_stripped_surface(stripped_input)
158-
if tf_candidates:
159-
return sorted(list(tf_candidates))[0]
147+
elapsed = (time.time() - start) * 1000
148+
logging.debug(f"Lemmatized '{word}' → '{aligned_restored}' in {elapsed:.1f}ms (layer: Restored+alignment)")
149+
return aligned_restored
160150

151+
# Validate restored lemma exists in TF
152+
stripped_restored = GreekNormalizer.strip_accents(restored_lemma)
153+
if stripped_restored in adapter._stripped_index:
154+
elapsed = (time.time() - start) * 1000
155+
logging.debug(f"Lemmatized '{word}' → '{restored_lemma}' in {elapsed:.1f}ms (layer: Restored)")
156+
return restored_lemma
157+
158+
# ==========================================
159+
# LAST RESORT: Return OdyCy result
160+
# ==========================================
161+
elapsed = (time.time() - start) * 1000
162+
logging.debug(f"Lemmatized '{word}' → '{odycy_lemma}' in {elapsed:.1f}ms (layer: OdyCy-only, TF miss)")
161163
return odycy_lemma
162164

163-
if not candidate_lemmas:
164-
# Neither OdyCy lemma nor Surface lookup found anything.
165-
return odycy_lemma
166-
167-
# We have candidates from Surface lookup.
168-
# Return the first sorted one.
169-
return sorted(list(candidate_lemmas))[0]
170165

171166

172167
def find_in_text_fabric(lemma: str, original_word: str, limit: int):
@@ -300,11 +295,11 @@ def main():
300295

301296
word = args.word
302297

303-
# Step 1: Lemmatize with OdyCy
304-
lemma = lemmatize(word)
305-
306-
# Step 2: Search Text-Fabric
307-
output = find_in_text_fabric(lemma, word, args.limit)
298+
# Step 1: Search Text-Fabric
299+
# We pass the original word as the 'lemma' initially.
300+
# find_in_text_fabric will call smart_lemmatize internally to find the real lemma.
301+
# This enables lazy-loading of OdyCy (only if TF lookup fails).
302+
output = find_in_text_fabric(word, word, args.limit)
308303

309304
# Check if the search yielded results.
310305
# If not, and if the lemma is same as word (OdyCy failure) or just no results,

src/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ def find(
329329
import json
330330
import sys
331331

332-
typer.secho(f"Analyzing '{word}' with OdyCy...", fg=typer.colors.CYAN)
332+
typer.secho(f"Searching for '{word}'...", fg=typer.colors.CYAN)
333333

334334
# Locate worker script
335335
# Assuming src/cli.py is at [root]/src/cli.py

0 commit comments

Comments (0)