gladiaio · Karamouche · Apr 21, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 21, 2026
diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py
@@ -1,7 +1,15 @@
-from . import english, french, german, italian, spanish
+from . import dutch, english, french, german, italian, spanish
 from .base import LanguageOperators
 from .registry import get_language_registry, register_language
 
 register_language(LanguageOperators)
 
-__all__ = ["english", "french", "german", "italian", "spanish", "get_language_registry"]
+__all__ = [  # names exported by "from normalization.languages import *"
+    "dutch",
+    "english",
+    "french",
+    "german",
+    "italian",
+    "spanish",
+    "get_language_registry",
+]
diff --git a/normalization/languages/dutch/__init__.py b/normalization/languages/dutch/__init__.py
@@ -0,0 +1,7 @@
+from .operators import DutchOperators
+from .replacements import DUTCH_REPLACEMENTS
+
+__all__ = [  # public names of the dutch language package
+    "DutchOperators",
+    "DUTCH_REPLACEMENTS",
+]
diff --git a/normalization/languages/dutch/number_normalizer.py b/normalization/languages/dutch/number_normalizer.py
@@ -0,0 +1,144 @@
+"""Dutch number normalizer using text2num's alpha2digit.
+
+Converts spelled-out numbers to digits (e.g. vijf en twintig → 25) and handles
+mixed digit+word forms (e.g. 3 miljard → drie miljard) before conversion so
+alpha2digit does not misinterpret them. Optionally rewrites currency symbols to
+amount + spoken singular unit, then restores plural trailing words from config.
+"""
+
+import re
+
+from text_to_num import alpha2digit
+
+# Single ASCII digit → Dutch word; used to rewrite "3 miljard" as "drie miljard".
+_DIGIT_TO_DUTCH: dict[str, str] = {
+    "0": "nul",
+    "1": "een",
+    "2": "twee",
+    "3": "drie",
+    "4": "vier",
+    "5": "vijf",
+    "6": "zes",
+    "7": "zeven",
+    "8": "acht",
+    "9": "negen",
+}
+
+# Group 1: digit run; group 2: large-number word (singular or plural form).
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+(miljoen|miljoenen|miljard|miljarden|biljoen|biljoenen)\b",
+    re.IGNORECASE,  # multiplier word is matched in any casing
+)
+
+
+def _normalize_mixed_numbers(text: str) -> str:
+    """Convert '3 miljard' → 'drie miljard' so alpha2digit yields 3e9, not '3 1000000000'.
+
+    alpha2digit may concatenate a lone digit with the following word; converting
+    the digit to a word avoids that (e.g. 'drie miljard' → 3000000000).
+    """
+
+    def replace(match: re.Match) -> str:
+        number = match.group(1)  # the digit run
+        multiplier = match.group(2)  # multiplier word, casing as written
+        if len(number) == 1 and number in _DIGIT_TO_DUTCH:  # one ASCII digit
+            return f"{_DIGIT_TO_DUTCH[number]} {multiplier}"  # gap → one space
+        # Multi-digit: keep as-is; alpha2digit will handle or leave unchanged
+        return match.group(0)
+
+    return _RE_MIXED_NUMBER.sub(replace, text)
+
+
+def _singular_spoken_unit(trailing_word: str) -> str:
+    """Map ``currency_symbol_to_word`` value to a spoken singular alpha2digit accepts."""
+    t = trailing_word.lower()  # the lookups below are case-insensitive
+    if t == "euros":
+        return "euro"
+    if t == "dollars":
+        return "dollar"
+    if t == "ponden":
+        return "pond"
+    if t == "yens":
+        return "yen"
+    return trailing_word  # no known singular: returned as given, casing kept
+
+
+def _normalize_currency_symbols(  # symbol before/after amount → "amount unit"
+    text: str,
+    currency_symbol_to_word: dict[str, str] | None,
+) -> str:
+    if not currency_symbol_to_word:  # None or empty mapping: nothing to do
+        return text
+    num = r"\d+(?:[.,]\d+)?"  # integer with optional "." or "," fraction
+    for symbol, trailing in currency_symbol_to_word.items():
+        singular = _singular_spoken_unit(trailing)  # e.g. "euros" → "euro"
+        esc = re.escape(symbol)  # "$" and similar are regex metacharacters
+        text = re.sub(rf"{esc}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE)
+        text = re.sub(rf"({num})\s*{esc}", rf"\1 {singular}", text, flags=re.IGNORECASE)
+    return text
+
+
+def _currency_plural_fix_patterns(
+    currency_symbol_to_word: dict[str, str] | None,
+) -> tuple[tuple[re.Pattern[str], str], ...]:
+    """Build (pattern, replacement) pairs so digit + alpha2digit singular → config trailing word."""
+    if not currency_symbol_to_word:
+        return ()
+    amount = r"(\d+(?:[.,]\d+)?)"  # the amount is capture group 1
+    seen: set[str] = set()  # lowercased trailing words already handled
+    out: list[tuple[re.Pattern[str], str]] = []
+    for _symbol, trailing in currency_symbol_to_word.items():
+        tl = trailing.lower()
+        if tl in seen:  # one pattern per distinct trailing word
+            continue
+        seen.add(tl)
+        singular = _singular_spoken_unit(trailing)
+        if singular.lower() == tl:  # singular equals trailing word: no fix
+            continue
+        if tl == "euros":  # this branch also accepts the "euro's" spelling
+            pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE)
+            out.append((pat, rf"\1 {trailing}"))
+        else:
+            pat = re.compile(
+                rf"\b{amount}\s+{re.escape(singular)}\b",
+                re.IGNORECASE,
+            )
+            out.append((pat, rf"\1 {trailing}"))  # trailing keeps config casing
+    return tuple(out)
+
+
+def _apply_currency_plural_fixes(  # runs each (pattern, replacement) pair on text
+    text: str,
+    fixers: tuple[tuple[re.Pattern[str], str], ...],
+) -> str:
+    for pattern, repl in fixers:  # applied sequentially, in tuple order
+        text = pattern.sub(repl, text)
+    return text
+
+
+class DutchNumberNormalizer:
+    """Convert Dutch spelled-out numbers to digits via text2num.alpha2digit.
+
+    Applies pre-passes for currency symbols (when configured) and mixed digit+word
+    forms (e.g. 3 miljard) before calling alpha2digit, then normalizes currency
+    words to the plural forms in ``currency_symbol_to_word``.
+    """
+
+    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
+        if alpha2digit is None:  # NOTE(review): top-level import never yields None
+            raise ImportError(
+                "Dutch number normalization requires the text2num package. "
+                "Install it with: uv add text2num"
+            )
+        self._alpha2digit = alpha2digit
+        self._currency_symbol_to_word = currency_symbol_to_word
+        self._currency_plural_fixes = _currency_plural_fix_patterns(  # built once
+            currency_symbol_to_word,
+        )
+
+    def __call__(self, text: str) -> str:  # currency → mixed → alpha2digit → plurals
+        text = _normalize_currency_symbols(text, self._currency_symbol_to_word)
+        text = _normalize_mixed_numbers(text)
+        text = self._alpha2digit(text, "nl")  # "nl" is the language argument
+        text = _apply_currency_plural_fixes(text, self._currency_plural_fixes)
+        return text
diff --git a/normalization/languages/dutch/operators.py b/normalization/languages/dutch/operators.py
@@ -0,0 +1,116 @@
+import re
+
+from normalization.languages.base import (
+    LanguageConfig,
+    LanguageOperators,
+)
+from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer
+from normalization.languages.dutch.sentence_replacements import (
+    DUTCH_SENTENCE_REPLACEMENTS,
+)
+from normalization.languages.registry import register_language
+
+# Flemish apostrophe clitics (straight or typographic apostrophe). (?<!\w) avoids
+# English-style possessives (e.g. Jan's) where the apostrophe follows a letter.
+_APOST = r"['\u2019]"  # raw string: the \u2019 escape is resolved by re
+_TEMPORAL_S_AFTER = (  # words before which 's is expanded to "des"
+    r"ochtends|morgens|middags|namiddags|avonds|nachts|"
+    r"zaterdags|zondags|weekend|weekends"
+)
+_RE_TEMPORAL_S = re.compile(
+    rf"(?<!\w){_APOST}s(\s+)({_TEMPORAL_S_AFTER})\b",  # groups: gap, time word
+    re.IGNORECASE,
+)
+_RE_CLITIC_S = re.compile(rf"(?<!\w){_APOST}s\b", re.IGNORECASE)  # any other 's
+_RE_CLITIC_TRNKM = re.compile(rf"(?<!\w){_APOST}([trnkm])\b", re.IGNORECASE)
+
+_CLITIC_LETTER_TO_WORD = {  # letter captured by _RE_CLITIC_TRNKM → full word
+    "t": "het",
+    "r": "er",
+    "n": "een",
+    "k": "ik",
+    "m": "hem",
+}
+
+DUTCH_CONFIG = LanguageConfig(  # passed to LanguageOperators by DutchOperators
+    code="nl",
+    decimal_separator=",",
+    decimal_word="komma",
+    thousand_separator=" ",  # NOTE(review): Dutch commonly groups with "." — confirm
+    symbols_to_words={
+        "@": "apenstaartje",
+        ".": "punt",
+        "+": "plus",
+        "=": "gelijk aan",
+        ">": "groter dan",
+        "<": "kleiner dan",
+        "°": "graden",
+        "°C": "graden celsius",
+        "°F": "graden fahrenheit",
+        "%": "procent",
+    },
+    currency_symbol_to_word={  # also handed to DutchNumberNormalizer
+        "€": "euros",
+        "$": "dollars",
+        "£": "ponden",
+        "¢": "cent",  # _singular_spoken_unit returns this one unchanged
+        "¥": "yens",
+    },
+    filler_words=[
+        "ah",
+        "allee",
+        "alee",
+        "eh",
+        "ehm",
+        "hé",
+        "hè",
+        "he",
+        "hm",
+        "hmm",
+        "mm",
+        "mmm",
+        "mhm",
+        "nou",
+        "o",
+        "oke",
+        "okee",
+        "oké",
+        "uh",
+    ],
+    sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS,
+)
+
+
+@register_language
+class DutchOperators(LanguageOperators):
+    def __init__(self):
+        super().__init__(DUTCH_CONFIG)
+        self._number_normalizer = DutchNumberNormalizer(  # built once per instance
+            DUTCH_CONFIG.currency_symbol_to_word,
+        )
+
+    def expand_written_numbers(self, text: str) -> str:
+        """Convert Dutch spelled-out numbers to digits (vijf en twintig → 25).
+
+        Uses DutchNumberNormalizer, which normalizes currency symbols and mixed forms
+        (3 miljard → drie miljard), then text2num.alpha2digit.
+        """
+        return self._number_normalizer(text)
+
+    def expand_contractions(self, text: str) -> str:
+        def _temporal_sub(m: re.Match[str]) -> str:
+            return f"des{m.group(1)}{m.group(2).lower()}"  # time word lowercased
+
+        text = _RE_TEMPORAL_S.sub(_temporal_sub, text)  # before the generic 's rule
+        text = _RE_CLITIC_S.sub("is", text)  # every remaining clitic 's → "is"
+
+        def _clitic_sub(m: re.Match[str]) -> str:
+            return _CLITIC_LETTER_TO_WORD[m.group(1).lower()]  # keys are lowercase
+
+        text = _RE_CLITIC_TRNKM.sub(_clitic_sub, text)
+        return text
+
+    def get_word_replacements(self) -> dict[str, str]:
+        from normalization.languages.dutch.replacements import DUTCH_REPLACEMENTS
+
+        return DUTCH_REPLACEMENTS  # the module-level dict itself, not a copy
diff --git a/normalization/languages/dutch/replacements.py b/normalization/languages/dutch/replacements.py
@@ -0,0 +1,28 @@
+"""Single-token Flemish / colloquial → standard Dutch (canonical for WER)."""
+
+DUTCH_REPLACEMENTS: dict[str, str] = {
+    # Flemish dialect → standard Dutch
+    "ge": "je",
+    "da": "dat",
+    "ne": "een",
+    "efkes": "even",
+    "effe": "even",
+    "awel": "wel",
+    "den": "de",
+    "mijne": "mijn",
+    "gij": "jij",
+    "zij": "ze",
+    "zijne": "zijn",
+    # Bare clitics (apostrophe dropped by ASR). NOTE(review): "n"/"m" omitted?
+    "t": "het",
+    "s": "is",
+    "r": "er",
+    "k": "ik",
+    # Formal / informal pronoun conflation (Flemish customer service)
+    # ref uses formal u/uw; models transcribe je — normalise to je
+    "u": "je",
+    "uw": "je",
+    # Spelling variants → canonical
+    "okee": "oke",  # canonical spelling; both forms are also in filler_words
+    "euro": "euros",  # collapse singular/plural
+}
diff --git a/normalization/languages/dutch/sentence_replacements.py b/normalization/languages/dutch/sentence_replacements.py
@@ -0,0 +1,9 @@
+"""Multi-word and phrase-level normalization for Dutch (incl. Flemish variants)."""
+
+DUTCH_SENTENCE_REPLACEMENTS: dict[str, str] = {  # source text → canonical form
+    "fifty fifty": "5050",
+    "fiftyfifty": "5050",
+    "checks": "cheques",  # NOTE(review): single token in a phrase table — intended?
+    "goeiemiddag": "goedemiddag",
+    "kollega": "collega",
+}
diff --git a/tests/e2e/files/gladia-3/nl.csv b/tests/e2e/files/gladia-3/nl.csv
@@ -0,0 +1,25 @@
+input,expected
+tien euro,10 euros
+2 < 5,2 kleiner dan 5
+50°C,50 graden celsius
+ca kost €50,ca kost 50 euros
+"1.234,56",1234 komma 56
+dertien appels,13 appels
+kollega zegt hallo,collega zegt hallo
+ge weet da,je weet dat
+ik zeg 't zo,ik zeg het zo
+honderd euro,100 euros
+vijf dollar,5 dollars
+honderd euro's,100 euros
+"3,14",3 komma 14
+192.168.1.1,192 punt 168 punt 1 punt 1
+test@example.com,test apenstaartje example punt com
+www.example.com,w w w punt example punt com
+x = 5,x gelijk aan 5
+Het woord [inaudible] is hier,het woord inaudible is hier
+hallo eh daar,hallo daar
+mein  naam is Bob,mein naam is bob
+twee duizend,2000
+'s ochtends vroeg,des ochtends vroeg
+ping pong,ping pong
+vijf en twintig euro,25 euros