Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions normalization/languages/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
from . import english, french, german, italian, spanish
from . import dutch, english, french, german, italian, spanish
from .base import LanguageOperators
from .registry import get_language_registry, register_language

# Pass the base operators class to the registry helper.
# NOTE(review): what registering the base class achieves is defined in
# .registry, which is not visible here.
register_language(LanguageOperators)

# Names exported by ``from normalization.languages import *``.
__all__ = [
    "dutch",
    "english",
    "french",
    "german",
    "italian",
    "spanish",
    "get_language_registry",
]
7 changes: 7 additions & 0 deletions normalization/languages/dutch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .operators import DutchOperators
from .replacements import DUTCH_REPLACEMENTS

# Public API of the Dutch language package: the operators class and the
# single-token replacement table imported above.
__all__ = [
    "DutchOperators",
    "DUTCH_REPLACEMENTS",
]
144 changes: 144 additions & 0 deletions normalization/languages/dutch/number_normalizer.py
Comment thread
Karamouche marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""Dutch number normalizer using text2num's alpha2digit.

Converts spelled-out numbers to digits (e.g. vijf en twintig → 25) and handles
mixed digit+word forms (e.g. 3 miljard → drie miljard) before conversion so
alpha2digit does not misinterpret them. Optionally rewrites currency symbols to
amount + spoken singular unit, then restores plural trailing words from config.
"""

import re

from text_to_num import alpha2digit

# Spoken Dutch form of each ASCII digit, keyed by the digit character.
# Used to rewrite "3 miljard" as "drie miljard" before number conversion.
_DIGIT_TO_DUTCH: dict[str, str] = dict(
    zip(
        "0123456789",
        (
            "nul",
            "een",
            "twee",
            "drie",
            "vier",
            "vijf",
            "zes",
            "zeven",
            "acht",
            "negen",
        ),
    )
)

# Pattern: a whole digit run followed by a Dutch large-number multiplier.
# Group 1 is the digit run, group 2 the multiplier word.
# The lookbehind rejects digits that are the tail of a decimal or grouped
# number ("2,5 miljard", "1.5 miljoen"): without it, only the trailing "5"
# would match and be spelled out, yielding the corrupted "2,vijf miljard".
_RE_MIXED_NUMBER = re.compile(
    r"(?<![\d.,])\b(\d+)\s+(miljoen|miljoenen|miljard|miljarden|biljoen|biljoenen)\b",
    re.IGNORECASE,
)


def _normalize_mixed_numbers(text: str) -> str:
    """Spell out a lone digit that precedes a large-number word.

    Every ``_RE_MIXED_NUMBER`` match whose numeric part is a single digit
    present in ``_DIGIT_TO_DUTCH`` is rewritten as ``"<word> <multiplier>"``
    (e.g. '3 miljard' → 'drie miljard'), with exactly one space between the
    two parts. The intent is that alpha2digit then sees a fully spelled-out
    number (→ 3000000000) instead of a digit next to a number word, which it
    may render as '3 1000000000'.

    Matches with a multi-digit numeric part are returned untouched.
    """

    def _spell_out(hit: re.Match) -> str:
        spoken = _DIGIT_TO_DUTCH.get(hit.group(1))
        if spoken is None:
            # Only single digits are keys of the table, so multi-digit runs
            # land here and are left exactly as written.
            return hit.group(0)
        return f"{spoken} {hit.group(2)}"

    return _RE_MIXED_NUMBER.sub(_spell_out, text)
Comment thread
Karamouche marked this conversation as resolved.


def _singular_spoken_unit(trailing_word: str) -> str:
    """Return the singular spoken unit for a ``currency_symbol_to_word`` value.

    The lookup is case-insensitive. Known plurals map to their lowercase
    singular; any other word is returned exactly as given (original casing
    included).
    """
    plural_to_singular = {
        "euros": "euro",
        "dollars": "dollar",
        "ponden": "pond",
        "yens": "yen",
    }
    return plural_to_singular.get(trailing_word.lower(), trailing_word)


def _normalize_currency_symbols(
text: str,
currency_symbol_to_word: dict[str, str] | None,
) -> str:
if not currency_symbol_to_word:
return text
num = r"\d+(?:[.,]\d+)?"
for symbol, trailing in currency_symbol_to_word.items():
singular = _singular_spoken_unit(trailing)
esc = re.escape(symbol)
text = re.sub(rf"{esc}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE)
text = re.sub(rf"({num})\s*{esc}", rf"\1 {singular}", text, flags=re.IGNORECASE)
return text


def _currency_plural_fix_patterns(
currency_symbol_to_word: dict[str, str] | None,
) -> tuple[tuple[re.Pattern[str], str], ...]:
"""Build (pattern, replacement) pairs so digit + alpha2digit singular → config trailing word."""
if not currency_symbol_to_word:
return ()
amount = r"(\d+(?:[.,]\d+)?)"
seen: set[str] = set()
out: list[tuple[re.Pattern[str], str]] = []
for _symbol, trailing in currency_symbol_to_word.items():
tl = trailing.lower()
if tl in seen:
continue
seen.add(tl)
singular = _singular_spoken_unit(trailing)
if singular.lower() == tl:
continue
if tl == "euros":
pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE)
out.append((pat, rf"\1 {trailing}"))
else:
pat = re.compile(
rf"\b{amount}\s+{re.escape(singular)}\b",
re.IGNORECASE,
)
out.append((pat, rf"\1 {trailing}"))
return tuple(out)


def _apply_currency_plural_fixes(
    text: str,
    fixers: tuple[tuple[re.Pattern[str], str], ...],
) -> str:
    """Apply each ``(pattern, replacement)`` pair to ``text`` in order.

    Every substitution runs on the output of the previous one; an empty
    ``fixers`` tuple returns ``text`` unchanged.
    """
    result = text
    for fixer in fixers:
        regex, replacement = fixer
        result = regex.sub(replacement, result)
    return result


class DutchNumberNormalizer:
    """Convert Dutch spelled-out numbers to digits via text2num's alpha2digit.

    Calling an instance runs four passes in order:

    1. currency symbols → ``<amount> <singular unit>`` (only when a
       ``currency_symbol_to_word`` mapping was supplied);
    2. mixed digit+word forms (e.g. 3 miljard → drie miljard);
    3. ``alpha2digit(text, "nl")``;
    4. singular currency words after an amount → the configured words from
       ``currency_symbol_to_word``.
    """

    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
        """Store the currency mapping and precompile the plural-fix patterns.

        Args:
            currency_symbol_to_word: Maps a currency symbol to the word that
                should follow the amount in the output; ``None`` disables the
                currency passes.

        Raises:
            ImportError: If ``alpha2digit`` is ``None``.
        """
        # NOTE(review): the module imports alpha2digit unconditionally, so this
        # guard only fires if the module-level name has been rebound to None.
        if alpha2digit is None:
            raise ImportError(
                "Dutch number normalization requires the text2num package. "
                "Install it with: uv add text2num"
            )
        self._alpha2digit = alpha2digit
        self._currency_symbol_to_word = currency_symbol_to_word
        # Built once per instance so __call__ does not recompile patterns.
        self._currency_plural_fixes = _currency_plural_fix_patterns(
            currency_symbol_to_word,
        )

    def __call__(self, text: str) -> str:
        """Return ``text`` with Dutch number words converted to digits."""
        text = _normalize_currency_symbols(text, self._currency_symbol_to_word)
        text = _normalize_mixed_numbers(text)
        text = self._alpha2digit(text, "nl")
        return _apply_currency_plural_fixes(text, self._currency_plural_fixes)
116 changes: 116 additions & 0 deletions normalization/languages/dutch/operators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import re

from normalization.languages.base import (
LanguageConfig,
LanguageOperators,
)
from normalization.languages.dutch.number_normalizer import DutchNumberNormalizer
from normalization.languages.dutch.sentence_replacements import (
DUTCH_SENTENCE_REPLACEMENTS,
)
from normalization.languages.registry import register_language

# Flemish apostrophe clitics (straight or typographic apostrophe). (?<!\w) avoids
# English-style possessives (e.g. Jan's) where the apostrophe follows a letter.
_APOST = r"['\u2019]"
# Words after which a standalone 's is the reduced genitive article "des", not
# the reduced copula "is". Besides parts of the day this covers the weekdays,
# the seasons and the fixed genitives 's werelds / 's lands; without them the
# generic 's → "is" rule below would turn "'s maandags" into "is maandags" and
# "'s werelds grootste" into "is werelds grootste".
_TEMPORAL_S_AFTER = (
    r"ochtends|morgens|middags|namiddags|avonds|nachts|"
    r"maandags|dinsdags|woensdags|donderdags|vrijdags|"
    r"zaterdags|zondags|weekend|weekends|"
    r"zomers|winters|anderendaags|werelds|lands"
)
# Group 1: whitespace after the clitic; group 2: the whitelisted word.
_RE_TEMPORAL_S = re.compile(
    rf"(?<!\w){_APOST}s(\s+)({_TEMPORAL_S_AFTER})\b",
    re.IGNORECASE,
)
# Any remaining standalone 's (e.g. "dat 's goed"); applied after the pattern
# above has consumed the "des" cases.
_RE_CLITIC_S = re.compile(rf"(?<!\w){_APOST}s\b", re.IGNORECASE)
Comment on lines +16 to +24
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

_RE_CLITIC_S may over-expand non-temporal 's to is.

After the temporal pass consumes 's ochtends/avonds/..., any remaining 's preceded by a non-word char (start of string, space, punctuation) is unconditionally replaced by is. That corrupts genuine Dutch constructions where standalone 's is a reduced genitive/adverbial article, e.g.:

  • 's werelds grootste (= "of the world's largest") → is werelds grootste
  • 's lands belangen → is lands belangen

These are less frequent than the informal "dat 's goed" → "dat is goed" case you want to cover, but silently wrong when they do appear. Consider either (a) extending _TEMPORAL_S_AFTER to include werelds|lands|..., or (b) only replacing 's with is when the previous token ends with a vowel/word consistent with a reduced copula (e.g. require (?:dat|hoe|wat|er|hier|daar)\s+ before it).

Also applies to: 99-100

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@normalization/languages/dutch/operators.py` around lines 16 - 24, The current
_RE_CLITIC_S unconditionally matches any leading non-word then _APOST s and will
incorrectly convert genitive/adverbial constructions like "'s werelds" to "is
werelds"; update the replacement regex logic so we only convert reduced copula
instances: replace _RE_CLITIC_S with a pattern that requires a copula-like
preceding token (e.g. use a positive lookbehind such as
(?<=\b(?:dat|hoe|wat|er|hier|daar)\s){_APOST}s\b) or alternatively add
exclusions by extending _TEMPORAL_S_AFTER to include known genitive words like
werelds|lands if you prefer the whitelist approach; adjust the code that
compiles _RE_CLITIC_S to use the chosen narrower pattern so only true "'s"
copula cases are replaced.

# Full word for each single-letter apostrophe clitic ('t, 'r, 'n, 'k, 'm).
_CLITIC_LETTER_TO_WORD = dict(
    t="het",
    r="er",
    n="een",
    k="ik",
    m="hem",
)

# Standalone apostrophe + one clitic letter (captured as group 1); the
# lookbehind keeps it from firing when the apostrophe follows a word character.
_RE_CLITIC_TRNKM = re.compile(
    rf"(?<!\w){_APOST}([trnkm])\b",
    re.IGNORECASE,
)

# Static configuration for Dutch ("nl"): number separators, symbol and currency
# vocabularies, filler words and phrase-level replacements.
# NOTE(review): how each field is consumed is defined by LanguageConfig /
# LanguageOperators, which are not visible here; entry order inside the
# mappings is left untouched in case consumers depend on it.
DUTCH_CONFIG = LanguageConfig(
    code="nl",
    # Dutch writes decimals with a comma, spoken as "komma".
    decimal_separator=",",
    decimal_word="komma",
    thousand_separator=" ",
    # Spoken form of individual symbols.
    symbols_to_words={
        "@": "apenstaartje",
        ".": "punt",
        "+": "plus",
        "=": "gelijk aan",
        ">": "groter dan",
        "<": "kleiner dan",
        "°": "graden",
        "°C": "graden celsius",
        "°F": "graden fahrenheit",
        "%": "procent",
    },
    # Word emitted after an amount for each currency symbol. This mapping is
    # also handed to DutchNumberNormalizer by DutchOperators.
    currency_symbol_to_word={
        "€": "euros",
        "$": "dollars",
        "£": "ponden",
        "¢": "cent",
        "¥": "yens",
    },
    # Hesitation / filler tokens, including accented spelling variants.
    filler_words=[
        "ah",
        "allee",
        "alee",
        "eh",
        "ehm",
        "hé",
        "hè",
        "he",
        "hm",
        "hmm",
        "mm",
        "mmm",
        "mhm",
        "nou",
        "o",
        "oke",
        "okee",
        "oké",
        "uh",
    ],
    sentence_replacements=DUTCH_SENTENCE_REPLACEMENTS,
)


@register_language
class DutchOperators(LanguageOperators):
    """Dutch (incl. Flemish) language operators configured by ``DUTCH_CONFIG``."""

    def __init__(self):
        super().__init__(DUTCH_CONFIG)
        currency_words = DUTCH_CONFIG.currency_symbol_to_word
        self._number_normalizer = DutchNumberNormalizer(currency_words)

    def expand_written_numbers(self, text: str) -> str:
        """Turn Dutch number words into digits (vijf en twintig → 25).

        Delegates to the ``DutchNumberNormalizer`` created in ``__init__``,
        which rewrites currency symbols and mixed forms (3 miljard → drie
        miljard) before calling text2num's alpha2digit.
        """
        normalize = self._number_normalizer
        return normalize(text)

    def expand_contractions(self, text: str) -> str:
        """Expand standalone apostrophe clitics.

        Applied in order:
        1. ``'s`` before a word matched by ``_RE_TEMPORAL_S`` becomes ``des``,
           keeping the original whitespace and lowercasing the following word;
        2. any remaining standalone ``'s`` becomes ``is``;
        3. ``'t 'r 'n 'k 'm`` become ``het er een ik hem``.
        """
        expanded = _RE_TEMPORAL_S.sub(
            lambda m: "des" + m.group(1) + m.group(2).lower(),
            text,
        )
        expanded = _RE_CLITIC_S.sub("is", expanded)
        return _RE_CLITIC_TRNKM.sub(
            lambda m: _CLITIC_LETTER_TO_WORD[m.group(1).lower()],
            expanded,
        )

    def get_word_replacements(self) -> dict[str, str]:
        """Return the single-token Dutch replacement table."""
        # Imported at call time, as in the original implementation.
        from normalization.languages.dutch.replacements import (
            DUTCH_REPLACEMENTS as replacements,
        )

        return replacements
28 changes: 28 additions & 0 deletions normalization/languages/dutch/replacements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Single-token Flemish / colloquial → standard Dutch (canonical for WER)."""

# Single-token replacement table: key is the token as transcribed, value is the
# canonical form it is rewritten to.
DUTCH_REPLACEMENTS: dict[str, str] = {
    # Flemish dialect → standard Dutch
    "ge": "je",
    "da": "dat",
    "ne": "een",
    "efkes": "even",
    "effe": "even",
    "awel": "wel",
    "den": "de",
    "mijne": "mijn",
    "gij": "jij",
    "zij": "ze",
    "zijne": "zijn",
    # Bare clitics (apostrophe dropped by ASR)
    "t": "het",
    "s": "is",
    "r": "er",
    "k": "ik",
    # Formal / informal pronoun conflation (Flemish customer service):
    # references use formal u/uw while models transcribe je, so both sides
    # are normalised to je.
    "u": "je",
    "uw": "je",
    # Spelling variants → canonical
    "okee": "oke",  # oke is already in filler_words; okee must map to it
    "euro": "euros",  # collapse singular/plural
}
9 changes: 9 additions & 0 deletions normalization/languages/dutch/sentence_replacements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Multi-word and phrase-level normalization for Dutch (incl. Flemish variants)."""

# Phrase-level replacement table: key is the text as transcribed, value is the
# canonical form. Passed to LanguageConfig as ``sentence_replacements``.
# NOTE(review): whether keys are matched as whole words or as substrings is
# decided by the consumer, which is not visible here.
DUTCH_SENTENCE_REPLACEMENTS: dict[str, str] = {
    # English loan expression → digits
    "fifty fifty": "5050",
    "fiftyfifty": "5050",
    # Spelling variants → canonical Dutch spelling
    "checks": "cheques",
    "goeiemiddag": "goedemiddag",
    "kollega": "collega",
}
25 changes: 25 additions & 0 deletions tests/e2e/files/gladia-3/nl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
input,expected
tien euro,10 euros
2 < 5,2 kleiner dan 5
50°C,50 graden celsius
ca kost €50,ca kost 50 euros
"1.234,56",1234 komma 56
dertien appels,13 appels
kollega zegt hallo,collega zegt hallo
ge weet da,je weet dat
ik zeg 't zo,ik zeg het zo
honderd euro,100 euros
vijf dollar,5 dollars
honderd euro's,100 euros
"3,14",3 komma 14
192.168.1.1,192 punt 168 punt 1 punt 1
test@example.com,test apenstaartje example punt com
www.example.com,w w w punt example punt com
x = 5,x gelijk aan 5
Het woord [inaudible] is hier,het woord inaudible is hier
hallo eh daar,hallo daar
mein naam is Bob,mein naam is bob
twee duizend,2000
's ochtends vroeg,des ochtends vroeg
ping pong,ping pong
vijf en twintig euro,25 euros
Loading