-
Notifications
You must be signed in to change notification settings - Fork 3
Add dutch language normalization #19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
60c622e
b458749
f7a6b52
eb6f0a4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,15 @@ | ||
| from . import english, french, german, italian, spanish | ||
| from . import dutch, english, french, german, italian, spanish | ||
| from .base import LanguageOperators | ||
| from .registry import get_language_registry, register_language | ||
|
|
||
| register_language(LanguageOperators) | ||
|
|
||
| __all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"] | ||
| __all__ = [ | ||
| "dutch", | ||
| "english", | ||
| "french", | ||
| "german", | ||
| "italian", | ||
| "spanish", | ||
| "get_language_registry", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| from .operators import DutchOperators | ||
| from .replacements import DUTCH_REPLACEMENTS | ||
|
|
||
| __all__ = [ | ||
| "DutchOperators", | ||
| "DUTCH_REPLACEMENTS", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,144 @@ | ||
| """Dutch number normalizer using text2num's alpha2digit. | ||
|
|
||
| Converts spelled-out numbers to digits (e.g. vijf en twintig → 25) and handles | ||
| mixed digit+word forms (e.g. 3 miljard → drie miljard) before conversion so | ||
| alpha2digit does not misinterpret them. Optionally rewrites currency symbols to | ||
| amount + spoken singular unit, then restores plural trailing words from config. | ||
| """ | ||
|
|
||
| import re | ||
|
|
||
| from text_to_num import alpha2digit | ||
|
|
||
| # Digit-to-Dutch-word mapping for normalizing "3 miljard" → "drie miljard". | ||
| _DIGIT_TO_DUTCH: dict[str, str] = { | ||
| "0": "nul", | ||
| "1": "een", | ||
| "2": "twee", | ||
| "3": "drie", | ||
| "4": "vier", | ||
| "5": "vijf", | ||
| "6": "zes", | ||
| "7": "zeven", | ||
| "8": "acht", | ||
| "9": "negen", | ||
| } | ||
|
|
||
| # Pattern: digit(s) followed by Dutch large-number multipliers. | ||
| _RE_MIXED_NUMBER = re.compile( | ||
| r"\b(\d+)\s+(miljoen|miljoenen|miljard|miljarden|biljoen|biljoenen)\b", | ||
| re.IGNORECASE, | ||
| ) | ||
|
|
||
|
|
||
def _normalize_mixed_numbers(text: str) -> str:
    """Spell out a lone digit that directly precedes a large-number word.

    Rewrites e.g. '3 miljard' to 'drie miljard' so that alpha2digit yields
    3000000000 rather than '3 1000000000'. Only single digits found in
    ``_DIGIT_TO_DUTCH`` are rewritten; longer digit runs are returned unchanged
    and left for alpha2digit to handle or ignore.
    """

    def _spell_out(hit: re.Match) -> str:
        # The table only has single-character keys, so any multi-digit run
        # (or unknown digit) misses the lookup and is kept verbatim.
        word = _DIGIT_TO_DUTCH.get(hit.group(1))
        if word is None:
            return hit.group(0)
        return f"{word} {hit.group(2)}"

    return _RE_MIXED_NUMBER.sub(_spell_out, text)
|
Karamouche marked this conversation as resolved.
|
||
|
|
||
|
|
||
| def _singular_spoken_unit(trailing_word: str) -> str: | ||
| """Map ``currency_symbol_to_word`` value to a spoken singular alpha2digit accepts.""" | ||
| t = trailing_word.lower() | ||
| if t == "euros": | ||
| return "euro" | ||
| if t == "dollars": | ||
| return "dollar" | ||
| if t == "ponden": | ||
| return "pond" | ||
| if t == "yens": | ||
| return "yen" | ||
| return trailing_word | ||
|
|
||
|
|
||
def _normalize_currency_symbols(
    text: str,
    currency_symbol_to_word: dict[str, str] | None,
) -> str:
    """Rewrite ``<symbol><amount>`` and ``<amount><symbol>`` as ``<amount> <unit>``.

    ``<unit>`` is the singular spoken form of the configured trailing word
    (e.g. "€50" becomes "50 euro").

    Args:
        text: Input text.
        currency_symbol_to_word: Mapping of currency symbol to trailing word.
            ``None`` or an empty mapping leaves *text* untouched.

    Returns:
        The text with every configured symbol next to an amount replaced.
    """
    if not currency_symbol_to_word:
        return text
    # Repeated separator groups keep a grouped amount such as "1.234,56" in one
    # piece. With a single optional group, "€1.234,56" became "1.234 euro,56".
    num = r"\d+(?:[.,]\d+)*"
    for symbol, trailing in currency_symbol_to_word.items():
        singular = _singular_spoken_unit(trailing)
        esc = re.escape(symbol)

        # A callable replacement inserts the unit literally, so a configured
        # word is never parsed as a regex replacement template.
        def _amount_then_unit(m: re.Match[str], unit: str = singular) -> str:
            return f"{m.group(1)} {unit}"

        text = re.sub(rf"{esc}\s*({num})", _amount_then_unit, text, flags=re.IGNORECASE)
        text = re.sub(rf"({num})\s*{esc}", _amount_then_unit, text, flags=re.IGNORECASE)
    return text
|
|
||
|
|
||
def _currency_plural_fix_patterns(
    currency_symbol_to_word: dict[str, str] | None,
) -> tuple[tuple[re.Pattern[str], str], ...]:
    """Build (pattern, replacement) pairs mapping ``<amount> <singular>`` to the config word.

    One pair is produced per distinct trailing word (compared case-insensitively)
    whose singular spoken form differs from it. Words that are already their own
    singular (e.g. "cent") produce no pair.

    Args:
        currency_symbol_to_word: Mapping of currency symbol to trailing word.

    Returns:
        A tuple of ``(compiled pattern, replacement template)`` pairs; empty when
        the mapping is ``None`` or empty.
    """
    if not currency_symbol_to_word:
        return ()
    amount = r"(\d+(?:[.,]\d+)?)"
    seen: set[str] = set()
    out: list[tuple[re.Pattern[str], str]] = []
    for trailing in currency_symbol_to_word.values():
        tl = trailing.lower()
        if tl in seen:
            continue
        seen.add(tl)
        singular = _singular_spoken_unit(trailing)
        if singular.lower() == tl:
            continue
        if tl == "euros":
            # Also swallow the Dutch plural "euro's", with a straight or a
            # typographic apostrophe. With only the straight one, "100 euro’s"
            # matched just "100 euro" and came out as "100 euros’s".
            unit = r"euro(?:['\u2019]s)?"
        else:
            unit = re.escape(singular)
        pat = re.compile(rf"\b{amount}\s+{unit}\b", re.IGNORECASE)
        out.append((pat, rf"\1 {trailing}"))
    return tuple(out)
|
|
||
|
|
||
| def _apply_currency_plural_fixes( | ||
| text: str, | ||
| fixers: tuple[tuple[re.Pattern[str], str], ...], | ||
| ) -> str: | ||
| for pattern, repl in fixers: | ||
| text = pattern.sub(repl, text) | ||
| return text | ||
|
|
||
|
|
||
class DutchNumberNormalizer:
    """Callable that converts Dutch spelled-out numbers to digits via alpha2digit.

    A call runs four steps in order: currency symbols are rewritten to
    ``<amount> <singular unit>`` (when a mapping is configured), lone digits
    before large-number words are spelled out (3 miljard -> drie miljard),
    alpha2digit is applied with language ``"nl"``, and finally currency words are
    restored to the forms given in ``currency_symbol_to_word``.
    """

    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
        # NOTE(review): alpha2digit is imported unconditionally at module level,
        # so this guard can only fire if that import is made optional.
        if alpha2digit is None:
            raise ImportError(
                "Dutch number normalization requires the text2num package. "
                "Install it with: uv add text2num"
            )
        self._currency_symbol_to_word = currency_symbol_to_word
        # Compile the plural-restoring patterns once per instance, not per call.
        self._currency_plural_fixes = _currency_plural_fix_patterns(
            currency_symbol_to_word,
        )
        self._alpha2digit = alpha2digit

    def __call__(self, text: str) -> str:
        """Return *text* with Dutch number words converted to digits."""
        with_units = _normalize_currency_symbols(text, self._currency_symbol_to_word)
        spelled_out = _normalize_mixed_numbers(with_units)
        digits = self._alpha2digit(spelled_out, "nl")
        return _apply_currency_plural_fixes(digits, self._currency_plural_fixes)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| import re | ||
|
|
||
| from normalization.languages.base import ( | ||
| LanguageConfig, | ||
| LanguageOperators, | ||
| ) | ||
| from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer | ||
| from normalization.languages.dutch.sentence_replacements import ( | ||
| DUTCH_SENTENCE_REPLACEMENTS, | ||
| ) | ||
| from normalization.languages.registry import register_language | ||
|
|
||
| # Flemish apostrophe clitics (straight or typographic apostrophe). (?<!\w) avoids | ||
| # English-style possessives (e.g. Jan's) where the apostrophe follows a letter. | ||
| _APOST = r"['\u2019]" | ||
| _TEMPORAL_S_AFTER = ( | ||
| r"ochtends|morgens|middags|namiddags|avonds|nachts|" | ||
| r"zaterdags|zondags|weekend|weekends" | ||
| ) | ||
| _RE_TEMPORAL_S = re.compile( | ||
| rf"(?<!\w){_APOST}s(\s+)({_TEMPORAL_S_AFTER})\b", | ||
| re.IGNORECASE, | ||
| ) | ||
| _RE_CLITIC_S = re.compile(rf"(?<!\w){_APOST}s\b", re.IGNORECASE) | ||
|
Comment on lines
+16
to
+24
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
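After the temporal pass consumes `'s` before the words in `_TEMPORAL_S_AFTER`, `_RE_CLITIC_S` rewrites every remaining standalone `'s` to "is", including fixed expressions that are not in that list.
These are less frequent than the informal "dat 's goed" → "dat is goed" case you want to cover, but silently wrong when they do appear. Consider either (a) extending `_TEMPORAL_S_AFTER` […]. Also applies to: 99-100. 🤖 Prompt for AI Agents |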
||
# Apostrophe followed by one of t/r/n/k/m as a standalone token ('t, 'r, 'n, 'k, 'm).
# Group 1 is the letter, which is the lookup key for the full word.
_RE_CLITIC_TRNKM = re.compile(
    r"(?<!\w)" + _APOST + r"([trnkm])\b",
    flags=re.IGNORECASE,
)
|
|
||
| _CLITIC_LETTER_TO_WORD = { | ||
| "t": "het", | ||
| "r": "er", | ||
| "n": "een", | ||
| "k": "ik", | ||
| "m": "hem", | ||
| } | ||
|
|
||
# Language configuration for Dutch ("nl"). DutchOperators passes it to the
# LanguageOperators base class and reads currency_symbol_to_word from it.
DUTCH_CONFIG = LanguageConfig(
    code="nl",
    decimal_separator=",",
    decimal_word="komma",
    thousand_separator=" ",
    # Spoken Dutch for symbols.
    # NOTE(review): "°" is listed before "°C" / "°F"; presumably the consumer
    # matches the longer symbols first — confirm against the base implementation.
    symbols_to_words={
        "@": "apenstaartje",
        ".": "punt",
        "+": "plus",
        "=": "gelijk aan",
        ">": "groter dan",
        "<": "kleiner dan",
        "°": "graden",
        "°C": "graden celsius",
        "°F": "graden fahrenheit",
        "%": "procent",
    },
    # Trailing word emitted for each currency symbol. This mapping is also handed
    # to DutchNumberNormalizer.
    currency_symbol_to_word={
        "€": "euros",
        "$": "dollars",
        "£": "ponden",
        "¢": "cent",
        "¥": "yens",
    },
    # Interjections and hesitation markers (presumably dropped during
    # normalization — verify against the base pipeline).
    filler_words=[
        "ah",
        "allee",
        "alee",
        "eh",
        "ehm",
        "hé",
        "hè",
        "he",
        "hm",
        "hmm",
        "mm",
        "mmm",
        "mhm",
        "nou",
        "o",
        "oke",
        "okee",
        "oké",
        "uh",
    ],
    sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS,
)
|
|
||
|
|
||
@register_language
class DutchOperators(LanguageOperators):
    """Language operators for Dutch, configured with ``DUTCH_CONFIG``."""

    def __init__(self):
        super().__init__(DUTCH_CONFIG)
        self._number_normalizer = DutchNumberNormalizer(
            DUTCH_CONFIG.currency_symbol_to_word,
        )

    def expand_written_numbers(self, text: str) -> str:
        """Convert Dutch spelled-out numbers to digits (vijf en twintig → 25).

        Delegates to DutchNumberNormalizer, which normalizes currency symbols and
        mixed forms (3 miljard → drie miljard) before calling text2num.alpha2digit.
        """
        normalize = self._number_normalizer
        return normalize(text)

    def expand_contractions(self, text: str) -> str:
        """Expand apostrophe clitics ('s, 't, 'r, 'n, 'k, 'm) to full words."""
        # 1) 's + time word -> "des <word>": whitespace kept, word lowercased.
        text = _RE_TEMPORAL_S.sub(
            lambda m: "des" + m.group(1) + m.group(2).lower(),
            text,
        )
        # 2) Every remaining standalone 's -> "is".
        text = _RE_CLITIC_S.sub("is", text)
        # 3) 't / 'r / 'n / 'k / 'm -> the full word from the lookup table.
        return _RE_CLITIC_TRNKM.sub(
            lambda m: _CLITIC_LETTER_TO_WORD[m.group(1).lower()],
            text,
        )

    def get_word_replacements(self) -> dict[str, str]:
        """Return the single-token Dutch replacement table."""
        # Imported at call time rather than at module import.
        from normalization.languages.dutch.replacements import (
            DUTCH_REPLACEMENTS as replacement_table,
        )

        return replacement_table
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| """Single-token Flemish / colloquial → standard Dutch (canonical for WER).""" | ||
|
|
||
# Single-token replacement table: key is the variant token, value the canonical
# form. Returned by DutchOperators.get_word_replacements().
DUTCH_REPLACEMENTS: dict[str, str] = {
    # Flemish dialect → standard Dutch
    "ge": "je",
    "da": "dat",
    "ne": "een",
    "efkes": "even",
    "effe": "even",
    "awel": "wel",
    "den": "de",
    "mijne": "mijn",
    "gij": "jij",
    "zij": "ze",
    "zijne": "zijn",
    # Bare clitics (apostrophe dropped by ASR)
    "t": "het",
    "s": "is",
    "r": "er",
    "k": "ik",
    # Formal / informal pronoun conflation (Flemish customer service):
    # the reference uses formal u/uw while models transcribe "je", so both
    # are normalised to "je".
    "u": "je",
    "uw": "je",
    # Spelling variants → canonical
    # NOTE(review): both "oke" and "okee" appear in DUTCH_CONFIG.filler_words;
    # confirm whether this mapping is still reachable.
    "okee": "oke",  # oke is already in filler_words; okee must map to it
    "euro": "euros",  # collapse singular/plural
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| """Multi-word and phrase-level normalization for Dutch (incl. Flemish variants).""" | ||
|
|
||
# Phrase-level replacement table: key is the variant, value the canonical form.
# Passed to LanguageConfig as sentence_replacements in DUTCH_CONFIG.
DUTCH_SENTENCE_REPLACEMENTS: dict[str, str] = {
    # Both spellings collapse to the same digit string.
    "fifty fifty": "5050",
    "fiftyfifty": "5050",
    # Spelling variants → canonical
    "checks": "cheques",
    "goeiemiddag": "goedemiddag",
    "kollega": "collega",
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| input,expected | ||
| tien euro,10 euros | ||
| 2 < 5,2 kleiner dan 5 | ||
| 50°C,50 graden celsius | ||
| ca kost €50,ca kost 50 euros | ||
| "1.234,56",1234 komma 56 | ||
| dertien appels,13 appels | ||
| kollega zegt hallo,collega zegt hallo | ||
| ge weet da,je weet dat | ||
| ik zeg 't zo,ik zeg het zo | ||
| honderd euro,100 euros | ||
| vijf dollar,5 dollars | ||
| honderd euro's,100 euros | ||
| "3,14",3 komma 14 | ||
| 192.168.1.1,192 punt 168 punt 1 punt 1 | ||
| test@example.com,test apenstaartje example punt com | ||
| www.example.com,w w w punt example punt com | ||
| x = 5,x gelijk aan 5 | ||
| Het woord [inaudible] is hier,het woord inaudible is hier | ||
| hallo eh daar,hallo daar | ||
| mein naam is Bob,mein naam is bob | ||
| twee duizend,2000 | ||
| 's ochtends vroeg,des ochtends vroeg | ||
| ping pong,ping pong | ||
| vijf en twintig euro,25 euros |
Uh oh!
There was an error while loading. Please reload this page.