@@ -73,100 +73,95 @@ def load_alignment_map():
7373
def smart_lemmatize(word: str, adapter) -> str:
    """Return the best lemma for *word* using a TF-first hybrid strategy.

    The layers are tried in order and the first one that yields an answer
    wins:

    1. TF direct: accent-insensitive surface lookup through ``adapter``.
       This is the fast path; OdyCy is not called at all when it hits.
    2. OdyCy + alignment: ``lemmatize(word)`` is accepted only when the
       alignment map rewrites it to a different lemma.
    3. Polytonic restoration: a polytonic surface form is recovered from
       the accent-stripped input (with movable-nu and ``-αν`` -> ``-ον``
       variations), lemmatized, and accepted when the alignment map
       rewrites it or when its stripped form is present in
       ``adapter._stripped_index``.
    4. Last resort: the unaligned OdyCy lemma of the original word.

    Each exit emits one DEBUG record on the root logger with the elapsed
    time and the name of the layer that answered.

    Args:
        word: Surface form to lemmatize.
        adapter: Object exposing ``build_stripped_index()``,
            ``find_lemma_by_stripped_surface()``,
            ``find_polytonic_surfaces()`` and ``_stripped_index``
            (presumably the project's Text-Fabric adapter; confirm
            against the caller).

    Returns:
        The lemma chosen by the first layer that succeeds.
    """
    import logging
    import time

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    started = time.perf_counter()

    def finish(lemma, layer):
        """Log the timing for *layer* and hand *lemma* back to the caller."""
        elapsed_ms = (time.perf_counter() - started) * 1000
        # Lazy %-args: the message is only formatted when DEBUG is enabled.
        logging.debug(
            "Lemmatized '%s' → '%s' in %.1fms (layer: %s)",
            word, lemma, elapsed_ms, layer,
        )
        return lemma

    alignment = load_alignment_map()

    def check_alignment(lemma):
        """Return the alignment-map correction for *lemma*, or *lemma* itself."""
        if alignment and lemma in alignment:
            return alignment[lemma]
        return lemma

    # The stripped index must exist before any accent-insensitive lookup.
    adapter.build_stripped_index()
    stripped_input = GreekNormalizer.strip_accents(word)

    # Layer 1: TF direct lookup (fast path).
    tf_lemma = adapter.find_lemma_by_stripped_surface(stripped_input)
    if tf_lemma:
        return finish(tf_lemma, "TF-direct")

    # Layer 2: OdyCy, trusted here only if the alignment map corrects it,
    # e.g. odycy="εἰλημμένος" -> aligned="λαμβάνω".
    odycy_lemma = lemmatize(word)
    aligned_lemma = check_alignment(odycy_lemma)
    if aligned_lemma != odycy_lemma:
        return finish(aligned_lemma, "OdyCy+alignment")

    # Layer 3: restore a polytonic surface form and feed that to OdyCy.
    candidates = _find_restoration_candidates(adapter, stripped_input)
    if candidates:
        # Alphabetically first candidate, so the choice is stable across runs.
        best_candidate = min(candidates)
        restored_lemma = lemmatize(best_candidate)

        aligned_restored = check_alignment(restored_lemma)
        if aligned_restored != restored_lemma:
            return finish(aligned_restored, "Restored+alignment")

        # Without an alignment hit, only accept the restored lemma when its
        # stripped form is present in the adapter's stripped index.
        stripped_restored = GreekNormalizer.strip_accents(restored_lemma)
        if stripped_restored in adapter._stripped_index:
            return finish(restored_lemma, "Restored")

    # Last resort: nothing validated the result, return OdyCy's raw answer.
    return finish(odycy_lemma, "OdyCy-only, TF miss")


def _find_restoration_candidates(adapter, stripped_input):
    """Return polytonic surface candidates for an accent-stripped form.

    Tries the exact stripped form first, then two fuzzy variations: an
    appended movable nu, and the modern/ancient ending swap ``-αν`` ->
    ``-ον``. Returns whatever the last attempted adapter lookup produced,
    which may be empty.
    """
    candidates = adapter.find_polytonic_surfaces(stripped_input)
    if candidates:
        return candidates

    # Movable nu: the input may simply lack the final ν.
    if not stripped_input.endswith("ν"):
        candidates = adapter.find_polytonic_surfaces(stripped_input + "ν")

    # Ending swap: a form in -αν is retried as -ον.
    if not candidates and stripped_input.endswith("αν"):
        candidates = adapter.find_polytonic_surfaces(stripped_input[:-2] + "ον")

    return candidates
170165
171166
172167def find_in_text_fabric (lemma : str , original_word : str , limit : int ):
@@ -300,11 +295,11 @@ def main():
300295
301296 word = args .word
302297
303- # Step 1: Lemmatize with OdyCy
304- lemma = lemmatize ( word )
305-
306- # Step 2: Search Text-Fabric
307- output = find_in_text_fabric (lemma , word , args .limit )
298+ # Step 1: Search Text-Fabric
299+ # We pass the original word as the 'lemma' initially.
300+ # find_in_text_fabric will call smart_lemmatize internally to find the real lemma.
301+ # This enables lazy-loading of OdyCy (only if TF lookup fails).
302+ output = find_in_text_fabric (word , word , args .limit )
308303
309304 # Check if the search yielded results.
310305 # If not, and if the lemma is same as word (OdyCy failure) or just no results,
0 commit comments