11"""Parsing of doctypes"""
22
3+ import enum
4+ import itertools
35import logging
46from collections .abc import Iterable
57from dataclasses import dataclass
@@ -35,31 +37,61 @@ def insert_between(iterable, *, sep):
3537 return out [:- 1 ]
3638
3739
class TokenFlag(enum.Flag):
    """Bit flags describing the role(s) a token plays inside a doctype.

    Flags can be combined with ``|``. A value that is exactly the union of
    two of `START`, `STOP`, `SEP` and `NAME` is rejected with a `ValueError`.
    """

    # docstub: off
    NAME = enum.auto()
    NATLANG = enum.auto()
    SUBSCRIPT = enum.auto()
    LITERAL = enum.auto()
    GENERATOR = enum.auto()
    ARRAY = enum.auto()
    UNION = enum.auto()
    START = enum.auto()
    STOP = enum.auto()
    SEP = enum.auto()
    # docstub: on

    @classmethod
    def _missing_(cls, value):
        """Reject forbidden flag pairs, else defer to `enum.Flag`.

        Parameters
        ----------
        value : object
            The value for which no existing member was found.

        Returns
        -------
        out : TokenFlag
            Whatever `enum.Flag._missing_` returns for `value`.

        Raises
        ------
        ValueError
            If `value` equals the union of two of `START`, `STOP`, `SEP`
            and `NAME`.
        """
        exclusive = (cls.START, cls.STOP, cls.SEP, cls.NAME)
        for first, second in itertools.combinations(exclusive, 2):
            # Compare by equality, not identity: `is` on ints only works for
            # the small values the interpreter happens to cache, which would
            # let most of the forbidden pairs slip through unnoticed.
            if value == (first.value | second.value):
                raise ValueError(f"{first.name}|{second.name} not allowed")
        return super()._missing_(value)
63+
64+
class Token(str):
    """A string that is one atomic piece of a doctype.

    In addition to its text, a token carries a `kind` (a `TokenFlag`) and an
    optional `pos` that is stored exactly as passed to the constructor.
    """

    flag = TokenFlag

    __slots__ = ("value", "kind", "pos")

    def __new__(cls, value, *, kind, pos=None):
        token = super().__new__(cls, value)
        # Normalize `kind` through the flag class so it is always a TokenFlag
        token.kind = TokenFlag(kind)
        token.pos = pos
        return token

    def __repr__(self):
        cls_name = type(self).__name__
        return f"{cls_name}('{self}', kind={self.kind!r})"

    @classmethod
    def find_iter(cls, iterable, *, kind):
        """Yield every token in `iterable` whose kind contains all of `kind`.

        `iterable` is flattened recursively first; items that are not
        instances of `cls` are skipped.
        """
        wanted = TokenFlag(kind)
        for candidate in flatten_recursive(iterable):
            if not isinstance(candidate, cls):
                continue
            # Every single flag in `wanted` must be set on the candidate
            if all(bit & candidate.kind for bit in wanted):
                yield candidate

    @classmethod
    def find_one(cls, iterable, *, kind):
        """Return the single token matching `kind`.

        Raises
        ------
        ValueError
            If the number of matching tokens is not exactly one.
        """
        matching = list(cls.find_iter(iterable, kind=kind))
        if len(matching) == 1:
            return matching[0]
        msg = (
            f"expected exactly one {cls.__name__} with {kind=}, "
            f"got {len(matching)}: {matching}"
        )
        raise ValueError(msg)
@@ -79,7 +111,11 @@ def qualname(self, tree):
79111 """
80112 children = tree .children
81113 _qualname = "." .join (children )
82- _qualname = Token (_qualname , kind = "qualname" )
114+ _qualname = Token (
115+ _qualname ,
116+ kind = Token .flag .NAME ,
117+ pos = (tree .meta .start_pos , tree .meta .end_pos ),
118+ )
83119 return _qualname
84120
85121 def rst_role (self , tree ):
@@ -92,7 +128,7 @@ def rst_role(self, tree):
92128 -------
93129 out : lark.Token
94130 """
95- qualname = Token .find_one (tree .children , kind = "qualname" )
131+ qualname = Token .find_one (tree .children , kind = Token . flag . NAME )
96132 return qualname
97133
98134 def union (self , tree ):
@@ -105,7 +141,7 @@ def union(self, tree):
105141 -------
106142 out : list[str]
107143 """
108- sep = Token (" | " , kind = "union_sep" )
144+ sep = Token (" | " , kind = Token . flag . UNION | Token . flag . SEP )
109145 out = insert_between (tree .children , sep = sep )
110146 return out
111147
@@ -119,7 +155,7 @@ def subscription(self, tree):
119155 -------
120156 out : str
121157 """
122- return self ._format_subscription (tree .children , name = "subscription" )
158+ return self ._format_subscription (tree .children )
123159
124160 def natlang_literal (self , tree ):
125161 """
@@ -131,8 +167,13 @@ def natlang_literal(self, tree):
131167 -------
132168 out : str
133169 """
134- items = [Token ("Literal" , kind = "qualname" ), * tree .children ]
135- out = self ._format_subscription (items , "nl_literal" )
170+ items = [
171+ Token ("Literal" , kind = Token .flag .LITERAL | Token .flag .NAME ),
172+ * tree .children ,
173+ ]
174+ out = self ._format_subscription (
175+ items , kind = Token .flag .LITERAL | Token .flag .NATLANG
176+ )
136177
137178 if len (tree .children ) == 1 :
138179 logger .warning (
@@ -143,6 +184,14 @@ def natlang_literal(self, tree):
143184 )
144185 return out
145186
def literal_item(self, tree):
    """Wrap the single child of `tree` in a token flagged as LITERAL.

    Parameters
    ----------
    tree
        Node whose `children` must hold exactly one item and whose
        `meta.start_pos` / `meta.end_pos` become the token's `pos`.

    Returns
    -------
    out : Token
        Token carrying the LITERAL flag, combined with the child's own kind
        if the child already is a `Token`.
    """
    item, *rest = tree.children
    assert not rest
    kind = (
        Token.flag.LITERAL | item.kind
        if isinstance(item, Token)
        else Token.flag.LITERAL
    )
    span = (tree.meta.start_pos, tree.meta.end_pos)
    return Token(item, kind=kind, pos=span)
194+
146195 def natlang_container (self , tree ):
147196 """
148197 Parameters
@@ -153,7 +202,7 @@ def natlang_container(self, tree):
153202 -------
154203 out : str
155204 """
156- return self ._format_subscription (tree .children , name = "nl_container" )
205+ return self ._format_subscription (tree .children , kind = Token . flag . NATLANG )
157206
158207 def natlang_array (self , tree ):
159208 """
@@ -165,11 +214,15 @@ def natlang_array(self, tree):
165214 -------
166215 out : str
167216 """
168- array_name = Token .find_one (tree .children , kind = "array_name" )
217+ array_name = Token .find_one (
218+ tree .children , kind = Token .flag .ARRAY | Token .flag .NAME
219+ )
169220 items = tree .children .copy ()
170221 items .remove (array_name )
171- items .insert (0 , Token (array_name , kind = "qualname" ))
172- return self ._format_subscription (items , name = "nl_array" )
222+ items .insert (0 , array_name )
223+ return self ._format_subscription (
224+ items , kind = Token .flag .ARRAY | Token .flag .NATLANG
225+ )
173226
174227 def array_name (self , tree ):
175228 """
@@ -186,7 +239,7 @@ def array_name(self, tree):
186239 # This currently relies on a hack that only allows specific names
187240 # in `array_expression` (see `ARRAY_NAME` terminal in gramar)
188241 qualname = self .qualname (tree )
189- qualname = Token (qualname , kind = "array_name" )
242+ qualname = Token (qualname , kind = Token . flag . NAME | Token . flag . ARRAY )
190243 return qualname
191244
192245 def shape (self , tree ):
@@ -228,22 +281,24 @@ def extra_info(self, tree):
228281 logger .debug ("dropping extra info" )
229282 return lark .Discard
230283
def _format_subscription(self, sequence, kind=None):
    """Lay out `sequence` as ``container[item, item, ...]``.

    Parameters
    ----------
    sequence
        The first element is the container; the remaining elements are the
        subscript content.
    kind : TokenFlag, optional
        Extra flags for the generated bracket and separator tokens. The
        SUBSCRIPT flag is always added.

    Returns
    -------
    out : list
        The container, a "[" token, the content interleaved with ", "
        tokens, and a "]" token.
    """
    flags = Token.flag.SUBSCRIPT if kind is None else kind | Token.flag.SUBSCRIPT

    comma = Token(", ", kind=flags | Token.flag.SEP)
    container, *content = sequence
    content = insert_between(content, sep=comma)
    assert content

    opening = Token("[", kind=flags | Token.flag.START)
    closing = Token("]", kind=flags | Token.flag.STOP)
    return [container, opening, *content, closing]
243301
244- def __default_token__ (self , token ):
245- return Token (token .value , kind = token .type .lower ())
246-
247302
248303@dataclass (frozen = True , slots = True )
249304class ParsedDoctype :
@@ -265,16 +320,38 @@ def parse(cls, doctype):
265320
266321 Examples
267322 --------
268- >>> ParsedDoctype.parse("tuple of int or ndarray of dtype (float or int)")
323+ >>> doctype = ParsedDoctype.parse(
324+ ... "tuple of int or ndarray of dtype (float or int)"
325+ ... )
326+ >>> doctype
269327 <ParsedDoctype: 'tuple[int] | ndarray[float | int]'>
328+ >>> doctype.qualnames
329+ (Token('tuple', kind='qualname'),
330+ Token('int', kind='qualname'),
331+ Token('ndarray', kind='qualname'),
332+ Token('float', kind='qualname'),
333+ Token('int', kind='qualname'))
270334 """
271335 tree = _lark .parse (doctype )
272- result = DoctypeTransformer ().transform (tree = tree )
273- result = tuple (flatten_recursive (result ))
274- return cls (result , raw_doctype = doctype )
336+ tokens = DoctypeTransformer ().transform (tree = tree )
337+ tokens = tuple (flatten_recursive (tokens ))
338+ return cls (tokens , raw_doctype = doctype )
275339
def __str__(self):
    """Return all tokens concatenated into one string."""
    pieces = self.tokens
    return "".join(pieces)
278342
def __repr__(self):
    """Return ``<ClassName 'text'>`` where text is ``str(self)``."""
    cls_name = type(self).__name__
    return f"<{cls_name} '{self}'>"
345+
@property
def qualnames(self):
    """Tuple of all tokens in `tokens` whose kind includes the NAME flag."""
    names = Token.find_iter(self.tokens, kind=Token.flag.NAME)
    return tuple(names)
349+
def print_map_tokens_to_raw(self):
    """Print, for each token with a position, where it sits in the raw doctype.

    For every token whose `pos` is not None, four lines are printed: the raw
    doctype, a row of ``^`` spanning ``pos``, the token text indented to the
    start of that span, and an empty line. Tokens without a position are
    skipped.
    """
    for tok in self.tokens:
        if tok.pos is None:
            continue
        begin, end = tok.pos
        indent = " " * begin
        print(self.raw_doctype)
        print(indent + "^" * (end - begin))
        print(indent + tok)
        print()
0 commit comments