Skip to content

Commit 78cbbbb

Browse files
authored
Merge pull request #160 from dh-tech/feature/gregorian-human-lang-parsing
Add support for parsing Gregorian dates in standard text formats
2 parents 5b7db24 + c718db7 commit 78cbbbb

24 files changed

Lines changed: 438 additions & 27 deletions

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Change Log
22

3+
## 0.7
4+
5+
- Add parsing to Gregorian date converter; supports month names (full or abbreviated)
6+
in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya
7+
- Include Gregorian dates in omnibus parser
8+
39
## 0.6
410

511
- Experimental omnibus date converter + parser (EDTF, Hebrew, Hijri)

DEVELOPER_NOTES.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,19 @@ pip install -e ".[docs]"
8888
sphinx-build docs docs/_build
8989
```
9090

91-
HTML documentation will be generated in `docs/_build/html`
91+
HTML documentation will be generated in `docs/_build/html`
92+
93+
94+
### Regenerating multilingual Gregorian month name parse file
95+
96+
The Gregorian Lark parser includes a script-generated file, which
97+
populates month names based on a list of language codes using the Babel
98+
library. To regenerate, run the script with hatch (which should
99+
be installed globally):
100+
101+
```sh
102+
hatch run codegen:generate
103+
```
104+
105+
When the `.lark` file is modified by the script, it must be committed to git.
106+

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,12 @@ path = "src/undate/__init__.py"
8181
[tool.hatch.build.targets.sdist]
8282
include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"]
8383

84+
[tool.hatch.envs.codegen]
85+
dependencies = ["babel"]
86+
87+
[tool.hatch.envs.codegen.scripts]
88+
generate = "python scripts/generate_gregorian_grammar.py"
89+
8490
[tool.pytest.ini_options]
8591
pythonpath = "src/"
8692
markers = [
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python
2+
"""
3+
This script generates the gregorian_multilang.lark file
4+
with month names (full and abbreviated) based on the list of
5+
target languages.
6+
7+
Run this script with hatch to regenerate the file::
8+
9+
hatch run codegen:generate
10+
11+
"""
12+
13+
from collections import defaultdict
14+
import pathlib
15+
16+
from babel.dates import get_month_names
17+
18+
# directory holding the lark grammars, located relative to this script
_REPO_ROOT = pathlib.Path(__file__).parent.parent
GRAMMAR_DIR_PATH = _REPO_ROOT.joinpath("src", "undate", "converters", "grammars")

# grammar file written by this script, inside the grammar directory
MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH.joinpath("gregorian_multilang.lark")

# locale codes whose month names are included in the generated grammar
languages = [
    "en",  # English
    "es",  # Spanish
    "fr",  # French
    "de",  # German
    "rw",  # Kinyarwanda
    "lg",  # Ganda
    "ti",  # Tigrinya
]

# header comment placed at the top of the generated file
warning_text = (
    "// WARNING: This file is auto-generated. DO NOT EDIT.\n"
    "// To regenerate: hatch run codegen:generate\n"
    "\n"
)
41+
42+
43+
def main():
    """Regenerate the multilingual Gregorian month-name grammar file.

    Collects the wide and abbreviated month names for every locale in
    ``languages``, merges them per month number, and writes one
    case-insensitive rule per month to ``MONTH_GRAMMAR_FILE``.
    """
    # create a dictionary of lists to hold the names for each month
    all_month_names = defaultdict(list)

    for lang in languages:
        for width in ["wide", "abbreviated"]:
            for month_num, month_name in get_month_names(width, locale=lang).items():
                # some locales use a . on the shortened month; let's ignore that
                month_name = month_name.strip(".").lower()
                # In some cases different languages have the same abbreviations;
                # in some cases, abbreviated and full are the same.
                # Only add if not already present, to avoid redundancy
                if month_name not in all_month_names[month_num]:
                    all_month_names[month_num].append(month_name)

    # Month names include non-ASCII text, so write UTF-8 explicitly rather
    # than relying on the platform's default encoding.
    with MONTH_GRAMMAR_FILE.open("w", encoding="utf-8") as outfile:
        outfile.write(warning_text)

        # for each numeric month, generate a rule with all variant names:
        # month_1: /January|Jan/i
        for i, names in all_month_names.items():
            # combine all names in a case-insensitive OR regex;
            # sort longest variants first so that a short name which is a
            # prefix of a longer one does not match before it
            or_names = "|".join(sorted(names, key=len, reverse=True))
            outfile.write(f"month_{i}: /({or_names})/i\n")

    # Report the path relative to the working directory when possible;
    # relative_to raises ValueError when the script is run from a directory
    # that does not contain the grammar file, so fall back to the full path.
    try:
        display_path = MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())
    except ValueError:
        display_path = MONTH_GRAMMAR_FILE

    print(f"Successfully regenerated {display_path}")
    print("If the file has changed, make sure to commit the new version.")
73+
74+
75+
if __name__ == "__main__":
76+
main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from undate.converters.calendars.gregorian.converter import GregorianDateConverter
2+
3+
__all__ = ["GregorianDateConverter"]

src/undate/converters/calendars/gregorian.py renamed to src/undate/converters/calendars/gregorian/converter.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
from calendar import monthrange, isleap
22

3+
from lark.exceptions import UnexpectedInput
4+
5+
from undate.undate import Undate
36
from undate.converters.base import BaseCalendarConverter
7+
from undate.converters.calendars.gregorian.parser import gregorian_parser
8+
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
49

510

611
class GregorianDateConverter(BaseCalendarConverter):
@@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter):
1823
#: arbitrary known leap year
1924
LEAP_YEAR: int = 2024
2025

26+
def __init__(self):
    # transformer used by parse() to turn a parse tree into an Undate
    self.transformer = GregorianDateTransformer()
28+
2129
def min_month(self) -> int:
2230
"""First month for the Gregorian calendar."""
2331
return 1
@@ -79,3 +87,25 @@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]:
7987
a common point of comparison.
8088
"""
8189
return (year, month, day)
90+
91+
def parse(self, value: str) -> Undate:
    """
    Parse a Gregorian date string and return an
    :class:`~undate.undate.Undate`.

    Any supported precision and any supported language is accepted.
    The original input string is stored as the label of the returned
    Undate object.

    :raises ValueError: if the input is empty or cannot be parsed
    """
    if not value:
        raise ValueError("Parsing empty string is not supported")

    try:
        # run the Gregorian date parser, then convert the resulting
        # parse tree into an undate object
        tree = gregorian_parser.parse(value)
        parsed_date = self.transformer.transform(tree)
    except UnexpectedInput as err:
        raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err

    # keep the original date string as the label
    parsed_date.label = value
    return parsed_date
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from lark import Lark
2+
3+
from undate.converters import GRAMMAR_FILE_PATH
4+
5+
grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark"

# Open the grammar by filename (instead of passing the grammar text) to
# allow relative imports based on the grammar file.
# The parser is built once at import time, starting from the
# "gregorian_date" rule.
gregorian_parser = Lark.open(
    str(grammar_path), rel_to=__file__, start="gregorian_date", strict=True
)
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from lark import Transformer, Tree
2+
3+
from undate import Undate, Calendar
4+
5+
6+
class GregorianDateTransformer(Transformer):
    """Transform a Gregorian date parse tree and return an Undate.

    The ``year``, ``month`` and ``day`` callbacks each reduce their rule to
    a small tree holding a single value; ``gregorian_date`` then collects
    those values and builds the Undate.
    """

    # The parser is not currently expected to produce intervals.

    calendar = Calendar.GREGORIAN

    def gregorian_date(self, items):
        """Build an Undate from the year, month and day subtrees."""
        date_fields = ("year", "month", "day")
        # each matching subtree carries exactly one value, which is
        # cast to an integer and keyed by the name of its rule
        parts = {
            str(node.data): int(node.children[0])
            for node in items
            if node.data in date_fields
        }
        # return an undate with the collected parts in the Gregorian calendar
        return Undate(**parts, calendar=self.calendar)

    def year(self, items):
        """Join the pieces of the year into a single string value."""
        return Tree(data="year", children=["".join(map(str, items))])

    def month(self, items):
        """Work out the month number from the name of the matched rule."""
        # the month rule wraps a subtree named for the specific rule that
        # matched (month_1, month_2, etc); the number after the final
        # underscore is the month needed for converting the date
        month_rule = items[0]
        *_, month_n = month_rule.data.split("_")
        return Tree(data="month", children=[month_n])

    def day(self, items):
        """Join the pieces of the day into a single string value."""
        return Tree(data="day", children=["".join(map(str, items))])

src/undate/converters/calendars/hebrew/converter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import Union
22

33
from convertdate import hebrew # type: ignore
4-
from lark.exceptions import UnexpectedCharacters
4+
from lark.exceptions import UnexpectedInput
55

66
from undate import Undate, UndateInterval
77
from undate.converters.base import BaseCalendarConverter
@@ -111,7 +111,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
111111
# set the original date as a label, with the calendar name
112112
undate_obj.label = f"{value} {self.calendar_name}"
113113
return undate_obj
114-
except UnexpectedCharacters as err:
114+
except UnexpectedInput as err:
115115
raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err
116116

117117
# do we need to support conversion the other direction?

src/undate/converters/calendars/hebrew/parser.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark"
66

7-
with open(grammar_path) as grammar:
8-
# NOTE: LALR parser is faster but can't be used due to ambiguity between years and dates
9-
hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True)
7+
# open based on filename to allow relative imports based on grammar file
8+
hebrew_parser = Lark.open(
9+
str(grammar_path), rel_to=__file__, start="hebrew_date", strict=True
10+
)

0 commit comments

Comments
 (0)