Skip to content

Commit a2ffae6

Browse files
committed
WIP use Flags to mark doctype tokens
1 parent 444ef8a commit a2ffae6

2 files changed

Lines changed: 106 additions & 29 deletions

File tree

src/docstub/_doctype.py

Lines changed: 105 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Parsing of doctypes"""
22

3+
import enum
4+
import itertools
35
import logging
46
from collections.abc import Iterable
57
from dataclasses import dataclass
@@ -35,31 +37,61 @@ def insert_between(iterable, *, sep):
3537
return out[:-1]
3638

3739

40+
class TokenFlag(enum.Flag):
    """Flags marking the role(s) of a doctype token.

    Members can be combined with ``|`` to describe a token more precisely,
    e.g. ``UNION | SEP``. Combining exactly two of `START`, `STOP`, `SEP`
    and `NAME` is rejected with a `ValueError`.
    """

    # docstub: off
    NAME = enum.auto()
    NATLANG = enum.auto()
    SUBSCRIPT = enum.auto()
    LITERAL = enum.auto()
    GENERATOR = enum.auto()
    ARRAY = enum.auto()
    UNION = enum.auto()
    START = enum.auto()
    STOP = enum.auto()
    SEP = enum.auto()
    # docstub: on

    @classmethod
    def _missing_(cls, value):
        """Reject forbidden flag pairs before creating a combined member.

        Parameters
        ----------
        value : Any
            The raw value for which no existing member was found.

        Returns
        -------
        member : TokenFlag

        Raises
        ------
        ValueError
            If `value` is exactly the combination of two of `START`, `STOP`,
            `SEP` and `NAME`, or if the base class rejects the value.
        """
        exclusive = (cls.START, cls.STOP, cls.SEP, cls.NAME)
        for first, second in itertools.combinations(exclusive, 2):
            # Compare by equality, not identity: `is` on integers only works
            # by accident for small cached values, and e.g. START|STOP (384)
            # is not among them.
            if value == (first.value | second.value):
                raise ValueError(f"{first.name}|{second.name} not allowed")
        return super()._missing_(value)
63+
64+
3865
class Token(str):
    """A token representing an atomic part of a doctype.

    A token is a string that additionally carries a `kind` (a `TokenFlag`)
    and an optional `pos`.
    """

    # Shortcut so that callers can write `Token.flag.NAME`
    flag = TokenFlag

    __slots__ = ("value", "kind", "pos")

    def __new__(cls, value, *, kind, pos=None):
        """Create a new token.

        Parameters
        ----------
        value : str
            The text of the token.
        kind : TokenFlag
            Passed through `TokenFlag(...)`, so invalid values raise.
        pos : optional
            Stored as is. Callers in this module pass a ``(start, stop)``
            tuple taken from the parse tree's meta information.
        """
        token = super().__new__(cls, value)
        token.kind = TokenFlag(kind)
        token.pos = pos
        return token

    def __repr__(self):
        cls_name = type(self).__name__
        return f"{cls_name}('{self}', kind={self.kind!r})"

    @classmethod
    def find_iter(cls, iterable, *, kind):
        """Yield all tokens whose kind includes every flag in `kind`.

        `iterable` is flattened recursively; items that are not instances of
        this class are ignored.
        """
        wanted = TokenFlag(kind)
        for candidate in flatten_recursive(iterable):
            if not isinstance(candidate, cls):
                continue
            # Every single flag of the requested kind must be set
            if all(single & candidate.kind for single in wanted):
                yield candidate

    @classmethod
    def find_one(cls, iterable, *, kind):
        """Return the only token matching `kind`.

        Raises
        ------
        ValueError
            If no token or more than one token matches.
        """
        matching = list(cls.find_iter(iterable, kind=kind))
        if len(matching) == 1:
            return matching[0]
        msg = (
            f"expected exactly one {cls.__name__} with {kind=}, "
            f"got {len(matching)}: {matching}"
        )
        raise ValueError(msg)
@@ -79,7 +111,11 @@ def qualname(self, tree):
79111
"""
80112
children = tree.children
81113
_qualname = ".".join(children)
82-
_qualname = Token(_qualname, kind="qualname")
114+
_qualname = Token(
115+
_qualname,
116+
kind=Token.flag.NAME,
117+
pos=(tree.meta.start_pos, tree.meta.end_pos),
118+
)
83119
return _qualname
84120

85121
def rst_role(self, tree):
@@ -92,7 +128,7 @@ def rst_role(self, tree):
92128
-------
93129
out : lark.Token
94130
"""
95-
qualname = Token.find_one(tree.children, kind="qualname")
131+
qualname = Token.find_one(tree.children, kind=Token.flag.NAME)
96132
return qualname
97133

98134
def union(self, tree):
@@ -105,7 +141,7 @@ def union(self, tree):
105141
-------
106142
out : list[str]
107143
"""
108-
sep = Token(" | ", kind="union_sep")
144+
sep = Token(" | ", kind=Token.flag.UNION | Token.flag.SEP)
109145
out = insert_between(tree.children, sep=sep)
110146
return out
111147

@@ -119,7 +155,7 @@ def subscription(self, tree):
119155
-------
120156
out : str
121157
"""
122-
return self._format_subscription(tree.children, name="subscription")
158+
return self._format_subscription(tree.children)
123159

124160
def natlang_literal(self, tree):
125161
"""
@@ -131,8 +167,13 @@ def natlang_literal(self, tree):
131167
-------
132168
out : str
133169
"""
134-
items = [Token("Literal", kind="qualname"), *tree.children]
135-
out = self._format_subscription(items, "nl_literal")
170+
items = [
171+
Token("Literal", kind=Token.flag.LITERAL | Token.flag.NAME),
172+
*tree.children,
173+
]
174+
out = self._format_subscription(
175+
items, kind=Token.flag.LITERAL | Token.flag.NATLANG
176+
)
136177

137178
if len(tree.children) == 1:
138179
logger.warning(
@@ -143,6 +184,14 @@ def natlang_literal(self, tree):
143184
)
144185
return out
145186

187+
def literal_item(self, tree):
    """
    Parameters
    ----------
    tree : lark.Tree

    Returns
    -------
    out : Token
    """
    first, *extra = tree.children
    assert not extra
    # Always mark as literal; keep the flags of an already transformed token
    flags = Token.flag.LITERAL
    if isinstance(first, Token):
        flags = flags | first.kind
    span = (tree.meta.start_pos, tree.meta.end_pos)
    return Token(first, kind=flags, pos=span)
194+
146195
def natlang_container(self, tree):
147196
"""
148197
Parameters
@@ -153,7 +202,7 @@ def natlang_container(self, tree):
153202
-------
154203
out : str
155204
"""
156-
return self._format_subscription(tree.children, name="nl_container")
205+
return self._format_subscription(tree.children, kind=Token.flag.NATLANG)
157206

158207
def natlang_array(self, tree):
159208
"""
@@ -165,11 +214,15 @@ def natlang_array(self, tree):
165214
-------
166215
out : str
167216
"""
168-
array_name = Token.find_one(tree.children, kind="array_name")
217+
array_name = Token.find_one(
218+
tree.children, kind=Token.flag.ARRAY | Token.flag.NAME
219+
)
169220
items = tree.children.copy()
170221
items.remove(array_name)
171-
items.insert(0, Token(array_name, kind="qualname"))
172-
return self._format_subscription(items, name="nl_array")
222+
items.insert(0, array_name)
223+
return self._format_subscription(
224+
items, kind=Token.flag.ARRAY | Token.flag.NATLANG
225+
)
173226

174227
def array_name(self, tree):
175228
"""
@@ -186,7 +239,7 @@ def array_name(self, tree):
186239
# This currently relies on a hack that only allows specific names
187240
# in `array_expression` (see `ARRAY_NAME` terminal in grammar)
188241
qualname = self.qualname(tree)
189-
qualname = Token(qualname, kind="array_name")
242+
qualname = Token(qualname, kind=Token.flag.NAME | Token.flag.ARRAY)
190243
return qualname
191244

192245
def shape(self, tree):
@@ -228,22 +281,24 @@ def extra_info(self, tree):
228281
logger.debug("dropping extra info")
229282
return lark.Discard
230283

231-
def _format_subscription(self, sequence, kind=None):
    """Format a container and its content as a subscription expression.

    Parameters
    ----------
    sequence : Sequence
        The first item is the container, the remaining items its content.
    kind : TokenFlag, optional
        Extra flags for the created "[", ", " and "]" tokens. The
        `SUBSCRIPT` flag is always added.

    Returns
    -------
    out : list
        The container, followed by "[", the content separated by ", ",
        and "]".
    """
    if kind is None:
        base = Token.flag.SUBSCRIPT
    else:
        base = kind | Token.flag.SUBSCRIPT

    delimiter = Token(", ", kind=base | Token.flag.SEP)
    container, *content = sequence
    content = insert_between(content, sep=delimiter)
    assert content
    opening = Token("[", kind=base | Token.flag.START)
    closing = Token("]", kind=base | Token.flag.STOP)
    return [container, opening, *content, closing]
243301

244-
def __default_token__(self, token):
245-
return Token(token.value, kind=token.type.lower())
246-
247302

248303
@dataclass(frozen=True, slots=True)
249304
class ParsedDoctype:
@@ -265,16 +320,38 @@ def parse(cls, doctype):
265320
266321
Examples
267322
--------
268-
>>> ParsedDoctype.parse("tuple of int or ndarray of dtype (float or int)")
323+
>>> doctype = ParsedDoctype.parse(
324+
... "tuple of int or ndarray of dtype (float or int)"
325+
... )
326+
>>> doctype
269327
<ParsedDoctype 'tuple[int] | ndarray[float | int]'>
328+
>>> doctype.qualnames
329+
(Token('tuple', kind='qualname'),
330+
Token('int', kind='qualname'),
331+
Token('ndarray', kind='qualname'),
332+
Token('float', kind='qualname'),
333+
Token('int', kind='qualname'))
270334
"""
271335
tree = _lark.parse(doctype)
272-
result = DoctypeTransformer().transform(tree=tree)
273-
result = tuple(flatten_recursive(result))
274-
return cls(result, raw_doctype=doctype)
336+
tokens = DoctypeTransformer().transform(tree=tree)
337+
tokens = tuple(flatten_recursive(tokens))
338+
return cls(tokens, raw_doctype=doctype)
275339

276340
def __str__(self):
    """Return all tokens concatenated into a single string."""
    joined = "".join(self.tokens)
    return joined
278342

279343
def __repr__(self):
    """Return the class name and the string form in angle brackets."""
    cls_name = type(self).__name__
    return f"<{cls_name} '{self}'>"
345+
346+
@property
def qualnames(self):
    """All tokens flagged as `NAME`, in order of appearance."""
    name_tokens = Token.find_iter(self.tokens, kind=Token.flag.NAME)
    return tuple(name_tokens)
349+
350+
def print_map_tokens_to_raw(self):
    """Print how each positioned token maps onto the raw doctype.

    For every token with a position, print the raw doctype, a line of
    carets marking the token's span, and the token itself aligned with
    the span's start. Tokens without a position are skipped.
    """
    for token in self.tokens:
        if token.pos is None:
            continue
        start, stop = token.pos
        indent = " " * start
        print(self.raw_doctype)
        print(indent + "^" * (stop - start))
        print(indent + token)
        # NOTE(review): blank separator assumed to be printed once per
        # token -- confirm against the original indentation
        print()

src/docstub/doctype.lark

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ natlang_literal: "{" literal_item ("," literal_item)* "}"
5858

5959
// A single item in a literal expression (or `optional`). We must also allow
6060
// for qualified names, since a "class" or enum can be used as a literal too.
61-
?literal_item: ELLIPSES | STRING | SIGNED_NUMBER | qualname
61+
literal_item: ELLIPSES | STRING | SIGNED_NUMBER | qualname
6262

6363

6464
// Natural language forms of the subscription expression for containers.

0 commit comments

Comments
 (0)