import time
from abc import abstractmethod
from typing import Optional, List, Union
from antlr4 import CommonTokenStream, InputStream, Lexer, Parser, ParserATNSimulator, ParserRuleContext, \
PredictionContextCache, Recognizer, Token, TokenStream
from antlr4.error.ErrorListener import ErrorListener
from pylasu.model import walk, Source, Position, Node, Point
from pylasu.model.processing import assign_parents
from pylasu.parsing.parse_tree import token_end_point
from pylasu.parsing.results import ParsingResultWithFirstStage, FirstStageParsingResult
from pylasu.validation import Issue, IssueType


class PylasuANTLRParser:
    """A complete description of a multi-stage ANTLR-based parser, from source code to AST.

    You should extend this class to implement the parts that are specific to your language.
    Note: instances of this class are thread-safe and they're meant to be reused. Do not create a new
    PylasuANTLRParser instance every time you need to parse some source code, or performance may suffer."""

    def __init__(self):
        self.prediction_context_cache = PredictionContextCache()

    def parse(self, input_stream: Union[InputStream, str], consider_range: bool = True,
              measure_lexing_time: bool = False, source: Optional[Source] = None):
        """Parses source code, returning a result that includes an AST and a collection of parse issues
        (errors, warnings).

        The parsing is done in accordance with the StarLasu methodology, i.e., a first-stage parser builds
        a parse tree which is then mapped onto a higher-level tree called the AST.

        @param input_stream the source code, as an ANTLR InputStream or a plain string.
        @param consider_range if true (the default), parsed AST nodes record their position in the input text.
        @param measure_lexing_time if true, the result will include a measurement of the time spent in lexing,
        i.e., breaking the input stream into tokens.
        @param source the source (e.g., a file) the code comes from, if any; it's propagated to the result."""
        start = time.time_ns()
        if isinstance(input_stream, str):
            input_stream = InputStream(input_stream)
        first_stage = self.parse_first_stage(input_stream, measure_lexing_time)
        issues = first_stage.issues
        ast = self.parse_tree_to_ast(first_stage.root, consider_range, issues, source)
        self.assign_parents(ast)
        ast = self.post_process_ast(ast, issues) if ast else ast
        if ast and not consider_range:
            # Remove the parse tree nodes set as origins, because they cause the range to be computed
            for node in walk(ast):
                node.origin = None
        now = time.time_ns()
        return ParsingResultWithFirstStage(
            issues,
            ast,
            input_stream.getText(0, input_stream.index + 1),
            (now - start) // 1_000_000,
            first_stage,
            source,
        )

    def parse_first_stage(
            self, input_stream: InputStream, measure_lexing_time: bool = False,
            source: Optional[Source] = None
    ) -> FirstStageParsingResult:
        """Executes only the first stage of the parser, i.e., the production of a parse tree. Usually, you'll
        want to use the parse method instead, which returns an AST that is simpler to use and query."""
        issues = []
        lexing_time: Optional[int] = None
        total_time: int = time.time_ns()
        parser = self.create_parser(input_stream, issues)
        if measure_lexing_time:
            token_stream = parser.getInputStream()
            if isinstance(token_stream, CommonTokenStream):
                lexing_time = time.time_ns()
                token_stream.fill()
                token_stream.seek(0)
                lexing_time = (time.time_ns() - lexing_time) // 1_000_000
        root = self.invoke_root_rule(parser)
        if root:
            self.verify_parse_tree(parser, issues, root)
        total_time = (time.time_ns() - total_time) // 1_000_000
        return FirstStageParsingResult(issues, root, None, total_time, lexing_time, source)

    def create_parser(self, input_stream: InputStream, issues: List[Issue]) -> Parser:
        """Creates the first-stage parser."""
        lexer = self.create_antlr_lexer(input_stream)
        self.attach_listeners(lexer, issues)
        token_stream = self.create_token_stream(lexer)
        parser = self.create_antlr_parser(token_stream)
        # Assign a fresh interpreter using the shared prediction context cache,
        # to avoid caching DFA states indefinitely across executions
        parser._interp = ParserATNSimulator(
            parser, parser.atn, parser._interp.decisionToDFA, self.prediction_context_cache
        )
        self.attach_listeners(parser, issues)
        return parser

    def invoke_root_rule(self, parser: Parser):
        """Invokes the parser's root rule, i.e., the method that is responsible for parsing the entire input.
        Usually this is the topmost rule, the one with index 0 (as also assumed by other libraries such as
        antlr4-c3), so this method invokes that rule. If your grammar/parser is structured differently, or if
        you're using this to parse only a portion of the input or a subset of the language, you have to
        override this method to invoke the correct entry point."""
        return getattr(parser, parser.ruleNames[0])()
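
    # Illustrative sketch (not part of the library): for a grammar whose entry
    # point is a rule named "compilationUnit" rather than rule 0, a subclass
    # could override invoke_root_rule as:
    #
    #     def invoke_root_rule(self, parser: Parser):
    #         return parser.compilationUnit()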

    def verify_parse_tree(self, parser: Parser, issues: List[Issue], root: ParserRuleContext):
        """Checks the parse tree for correctness. If you're concerned about performance, you may want to
        override this to do nothing."""
        last_token: Token = parser.getTokenStream().get(parser.getTokenStream().index)
        if last_token.type != Token.EOF:
            issues.append(
                Issue(
                    IssueType.SYNTACTIC,
                    "The whole input was not consumed",
                    position=Position(token_end_point(last_token), token_end_point(last_token))
                )
            )
        # TODO Kolasu also traverses the parse tree searching for exceptions

    def assign_parents(self, ast):
        """Assigns a parent to each node in the AST. Does nothing if the AST is None."""
        if ast:
            assign_parents(ast)

    def post_process_ast(self, ast, issues):
        """Hook to process the AST after parsing; the default implementation returns it unchanged."""
        return ast

    def create_token_stream(self, lexer: Lexer) -> TokenStream:
        """Creates the token stream to feed to the parser; by default, a CommonTokenStream."""
        return CommonTokenStream(lexer)

    @abstractmethod
    def create_antlr_lexer(self, input_stream: InputStream) -> Lexer:
        """Creates the lexer."""
        pass

    @abstractmethod
    def create_antlr_parser(self, token_stream: TokenStream) -> Parser:
        """Creates the first-stage parser."""
        pass

    @abstractmethod
    def parse_tree_to_ast(
            self, root, consider_range: bool, issues: List[Issue], source: Optional[Source]
    ) -> Optional[Node]:
        """Maps the parse tree onto the AST, i.e., the second stage of the parser."""
        pass

    def attach_listeners(self, recognizer: Recognizer, issues: List[Issue]):
        recognizer.removeErrorListeners()
        recognizer.addErrorListener(ParserErrorListener(issues))


class ParserErrorListener(ErrorListener):
    def __init__(self, issues: List[Issue]):
        self.issues = issues

    def syntaxError(self, recognizer, offending_symbol, line, column, msg, e):
        start_point = Point(line, column)
        end_point = start_point
        if isinstance(offending_symbol, Token):
            end_point = token_end_point(offending_symbol)
        msg = (msg or "unspecified").capitalize()
        self.issues.append(Issue(IssueType.SYNTACTIC, msg, position=Position(start_point, end_point)))
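

# Usage sketch (illustrative, not part of this module): a minimal concrete parser
# built on PylasuANTLRParser. "MyLexer" and "MyParser" stand for the classes that
# ANTLR generates from your grammar, and "MyRootNode" for your AST root node; all
# three are hypothetical names.
#
#     class MyLanguageParser(PylasuANTLRParser):
#         def create_antlr_lexer(self, input_stream: InputStream) -> Lexer:
#             return MyLexer(input_stream)
#
#         def create_antlr_parser(self, token_stream: TokenStream) -> Parser:
#             return MyParser(token_stream)
#
#         def parse_tree_to_ast(self, root, consider_range, issues, source):
#             # Walk the parse tree and build the corresponding AST nodes here.
#             return MyRootNode(...)
#
# As the class docstring recommends, create one instance and reuse it:
#
#     parser = MyLanguageParser()
#     result = parser.parse("some source code")
#     print(result.issues)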