dh-tech
diff --git a/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎DEVELOPER_NOTES.md‎
Lines changed: 16 additions & 1 deletion b/‎DEVELOPER_NOTES.md‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 6 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎scripts/generate_gregorian_grammar.py‎
Lines changed: 76 additions & 0 deletions b/‎scripts/generate_gregorian_grammar.py‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎src/undate/converters/calendars/gregorian/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎src/undate/converters/calendars/gregorian/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎…undate/converters/calendars/gregorian.py‎ ‎…verters/calendars/gregorian/converter.py‎src/undate/converters/calendars/gregorian.py renamed to src/undate/converters/calendars/gregorian/converter.py
Lines changed: 30 additions & 0 deletions b/‎…undate/converters/calendars/gregorian.py‎ ‎…verters/calendars/gregorian/converter.py‎src/undate/converters/calendars/gregorian.py renamed to src/undate/converters/calendars/gregorian/converter.py
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/undate/converters/calendars/gregorian/parser.py‎
Lines changed: 10 additions & 0 deletions b/‎src/undate/converters/calendars/gregorian/parser.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/undate/converters/calendars/gregorian/transformer.py‎
Lines changed: 42 additions & 0 deletions b/‎src/undate/converters/calendars/gregorian/transformer.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎src/undate/converters/calendars/hebrew/converter.py‎
Lines changed: 2 additions & 2 deletions b/‎src/undate/converters/calendars/hebrew/converter.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/undate/converters/calendars/hebrew/parser.py‎
Lines changed: 4 additions & 3 deletions b/‎src/undate/converters/calendars/hebrew/parser.py‎
Lines changed: 4 additions & 3 deletions
@@ -1,5 +1,11 @@
 # Change Log
 
+## 0.7
+
+- Add parsing to Gregorian date converter; supports month names (full or abbreviated)
+  in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya
+- Include Gregorian dates in omnibus parser
+
 ## 0.6
 
 - Experimental omnibus date converter + parser (EDTF, Hebrew, Hijri)
 
@@ -88,4 +88,19 @@ pip install -e ".[docs]"
 sphinx-build docs docs/_build
 ```
 
-HTML documentation will be generated in `docs/_build/html`
+HTML documentation will be generated in `docs/_build/html`
+
+
+### Regenerating the multilingual Gregorian month name grammar file
+
+The Gregorian Lark parser includes a script-generated grammar file, which
+defines month names for a list of language codes using the Babel
+library. To regenerate it, run the script with hatch (which should
+be installed globally):
+
+```sh
+hatch run codegen:generate
+```
+    
+When the `.lark` file is modified by the script, it must be committed to git.
+
@@ -81,6 +81,12 @@ path = "src/undate/__init__.py"
 [tool.hatch.build.targets.sdist]
 include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"]
 
+[tool.hatch.envs.codegen]
+dependencies = ["babel"]
+
+[tool.hatch.envs.codegen.scripts]
+generate = "python scripts/generate_gregorian_grammar.py"
+
 [tool.pytest.ini_options]
 pythonpath = "src/"
 markers = [
 
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+"""
+This script generates the gregorian_multilang.lark file
+with month names (full and abbreviated) based on the list of
+target languages.
+
+Run this script with hatch to regenerate the file::
+
+    hatch run codegen:generate
+
+"""
+
+from collections import defaultdict
+import pathlib
+
+from babel.dates import get_month_names
+
+# lark grammar path relative to this script
+GRAMMAR_DIR_PATH = (
+    pathlib.Path(__file__).parent.parent / "src" / "undate" / "converters" / "grammars"
+)
+# file that is generated by this script, in that directory
+MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH / "gregorian_multilang.lark"
+
+# include month names in the following languages
+languages = [
+    "en",  # English
+    "es",  # Spanish
+    "fr",  # French
+    "de",  # German
+    "rw",  # Kinyarwanda
+    "lg",  # Ganda
+    "ti",  # Tigrinya
+]
+
+# warning to include at top of generated file
+warning_text = """// WARNING: This file is auto-generated. DO NOT EDIT.
+// To regenerate: hatch run codegen:generate
+
+"""
+
+
+def main():
+    # create a dictionary of lists to hold the names for each month
+    all_month_names = defaultdict(list)
+
+    for lang in languages:
+        for width in ["wide", "abbreviated"]:
+            for month_num, month_name in get_month_names(width, locale=lang).items():
+                # some locales use a . on the shortened month; let's ignore that
+                month_name = month_name.strip(".").lower()
+                # In some cases different languages have the same abbreviations;
+                # in some cases, abbreviated and full are the same.
+                # Only add if not already present, to avoid redundancy
+                if month_name not in all_month_names[month_num]:
+                    all_month_names[month_num].append(month_name)
+
+    with MONTH_GRAMMAR_FILE.open("w") as outfile:
+        outfile.write(warning_text)
+
+        # for each numeric month, generate a rule with all variant names:
+        # month_1:  /January|Jan/i
+        for i, names in all_month_names.items():
+            # combine all names in a case-insensitive OR regex
+            # sort shortest variants last to avoid partial matches hitting first
+            or_names = "|".join(sorted(names, key=len, reverse=True))
+            outfile.write(f"month_{i}: /({or_names})/i\n")
+
+    print(
+        f"Successfully regenerated {MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())}"
+    )
+    print("If the file has changed, make sure to commit the new version.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,3 @@
+from undate.converters.calendars.gregorian.converter import GregorianDateConverter
+
+__all__ = ["GregorianDateConverter"]
@@ -1,6 +1,11 @@
 from calendar import monthrange, isleap
 
+from lark.exceptions import UnexpectedInput
+
+from undate.undate import Undate
 from undate.converters.base import BaseCalendarConverter
+from undate.converters.calendars.gregorian.parser import gregorian_parser
+from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
 
 
 class GregorianDateConverter(BaseCalendarConverter):
@@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter):
     #: arbitrary known leap year
     LEAP_YEAR: int = 2024
 
+    def __init__(self):
+        # transformer instance that parse() uses to turn a parse tree
+        # into an Undate
+        # NOTE(review): super().__init__() is not called here; confirm that
+        # BaseCalendarConverter needs no initialization of its own
+        self.transformer = GregorianDateTransformer()
+
     def min_month(self) -> int:
         """First month for the Gregorian calendar."""
         return 1
@@ -79,3 +87,25 @@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]:
         a common point of comparison.
         """
         return (year, month, day)
+
+    def parse(self, value: str) -> Undate:
+        """
+        Parse a Gregorian date string of any supported precision in any
+        supported language and return an :class:`~undate.undate.Undate`.
+        The input date string is preserved in the label of the resulting
+        Undate object.
+        """
+        if not value:
+            raise ValueError("Parsing empty string is not supported")
+
+        # parse the input string, then transform to undate object
+        try:
+            # parse the string with our Gregorian date parser
+            parsetree = gregorian_parser.parse(value)
+            # transform the parse tree into an undate object
+            undate_obj = self.transformer.transform(parsetree)
+            # set the original date string as the label
+            undate_obj.label = value
+            return undate_obj
+        except UnexpectedInput as err:
+            raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err
@@ -0,0 +1,10 @@
+from lark import Lark
+
+from undate.converters import GRAMMAR_FILE_PATH
+
+grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark"
+
+# open based on filename to allow relative imports based on grammar file
+gregorian_parser = Lark.open(
+    str(grammar_path), rel_to=__file__, start="gregorian_date", strict=True
+)
@@ -0,0 +1,42 @@
+from lark import Transformer, Tree
+
+from undate import Undate, Calendar
+
+
+class GregorianDateTransformer(Transformer):
+    """Transform a Gregorian date parse tree and return an Undate."""
+
+    # Currently parser should not result in intervals
+
+    calendar = Calendar.GREGORIAN
+
+    def gregorian_date(self, items):
+        parts = {}
+        for child in items:
+            if child.data in ["year", "month", "day"]:
+                # in each case we expect one integer value;
+                # anonymous tokens convert to their value and cast as int
+                value = int(child.children[0])
+                parts[str(child.data)] = value
+
+        # initialize and return an undate with year, month, day and
+        # Gregorian calendar
+        return Undate(**parts, calendar=self.calendar)
+
+    def year(self, items):
+        # combine multiple parts into a single string
+        value = "".join([str(i) for i in items])
+        return Tree(data="year", children=[value])
+
+    def month(self, items):
+        # month has a nested tree for the rule and the value
+        # the name of the rule (month_1, month_2, etc) gives us the
+        # number of the month needed for converting the date
+        tree = items[0]
+        month_n = tree.data.split("_")[-1]
+        return Tree(data="month", children=[month_n])
+
+    def day(self, items):
+        # combine multiple parts into a single string
+        value = "".join([str(i) for i in items])
+        return Tree(data="day", children=[value])
@@ -1,7 +1,7 @@
 from typing import Union
 
 from convertdate import hebrew  # type: ignore
-from lark.exceptions import UnexpectedCharacters
+from lark.exceptions import UnexpectedInput
 
 from undate import Undate, UndateInterval
 from undate.converters.base import BaseCalendarConverter
@@ -111,7 +111,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
             # set the original date as a label, with the calendar name
             undate_obj.label = f"{value} {self.calendar_name}"
             return undate_obj
-        except UnexpectedCharacters as err:
+        except UnexpectedInput as err:
             raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err
 
     # do we need to support conversion the other direction?
 
@@ -4,6 +4,7 @@
 
 grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark"
 
-with open(grammar_path) as grammar:
-    # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates
-    hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True)
+# open based on filename to allow relative imports based on grammar file
+hebrew_parser = Lark.open(
+    str(grammar_path), rel_to=__file__, start="hebrew_date", strict=True
+)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from undate.converters.calendars.gregorian.converter import GregorianDateConverter`
	`2`	`+`
	`3`	`+__all__ = ["GregorianDateConverter"]`