From 4468a129efc691aaef6579cf8b59078dab8a7edb Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 20 Nov 2023 16:50:43 +0100 Subject: [PATCH 1/8] Added basic `.scan` function to `Lark` --- lark/lark.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ lark/lexer.py | 13 +++++++++++++ 2 files changed, 60 insertions(+) diff --git a/lark/lark.py b/lark/lark.py index 6d34aa62..764ddd31 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -657,5 +657,52 @@ def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callab """ return self.parser.parse(text, start=start, on_error=on_error) + def scan(self, text: str, start: Optional[str]=None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]: + """ + Scans the input text for non-overlapping matches of the rule specified by 'start' and + yields the start and end position as well as the resulting tree. + + Only works with parser='lalr' and lexer='contextual'. Works best if the first terminal(s) + that can be matched by grammar are unique in the text and always indicate the start of a match. + + Does not raise any exceptions except for invalid arguments/configurations. + + """ + if self.options.parser != 'lalr' or self.options.lexer != 'contextual': + raise ValueError("scan requires parser='lalr' and lexer='contextual'") + start_states = self.parser.parser._parse_table.start_states + if start is None: + if len(start_states) != 1: + raise ValueError("Need to specify start") + start, = start_states + start_state = start_states[start] + start_lex: BasicLexer = self.parser.lexer.lexers[start_state] + pos = 0 + while True: + start_pos = start_lex.scanner.search(text, pos) + if start_pos is None: + break + valid_end = [] + ip = self.parse_interactive(text[start_pos:], start=start) + tokens = ip.lexer_thread.lex(ip.parser_state) + while True: + try: + token = next(tokens) + ip.feed_token(token) + except (UnexpectedInput, StopIteration): + break + if '$END' in ip.choices(): + valid_end.append((token, ip.copy())) + for (last, pot) in valid_end[::-1]: + try: + res = pot.feed_eof(last) + except UnexpectedInput: + continue + else: + yield ((start_pos, start_pos + last.end_pos), res) + pos = start_pos + last.end_pos + break + else: + pos = start_pos + 1 ###} diff --git a/lark/lexer.py b/lark/lexer.py index 9061d600..cadb244b 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -390,6 +390,19 @@ def match(self, text, pos): if m: return m.group(0), m.lastgroup + def search(self, text, pos): + best = None, float("inf") + for mre in self._mres: + mre: re.Pattern + m = mre.search(text, pos) + if m: + if m.start() < best[1]: + best = (m.group(0), m.lastgroup), m.start() + if best[0] is None: + return None + else: + return best[1] + def _regexp_has_newline(r: str): r"""Expressions that may indicate newlines in a regexp: From c3fc43e451e65bff2a760b71b654854c0be9a3ae Mon Sep 17 00:00:00 2001 From: MegaIng Date: Wed, 19 Jun 2024 18:43:47 +0200 Subject: [PATCH 2/8] Added start_pos and end_pos to lex,parse,parse_interactive --- lark/lark.py | 17 +++++++----- lark/lexer.py | 60 +++++++++++++++++++++++++++------------- lark/parser_frontends.py | 28 ++++++++++++++----- tests/test_lexer.py | 13 +++++++++ tests/test_parser.py | 24 +++++++++++++++- 5 files changed, 108 insertions(+), 34 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 764ddd31..5defd4a2 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -600,8 +600,8 @@ def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_pat def __repr__(self): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) - - def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: + def lex(self, text: str, dont_ignore: bool = False, *, start_pos: Optional[int] = None, + end_pos: Optional[int] = None) -> Iterator[Token]: """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. @@ -613,7 +613,7 @@ def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: lexer = self._build_lexer(dont_ignore) else: lexer = self.lexer - lexer_thread = LexerThread.from_text(lexer, text) + lexer_thread = LexerThread.from_text(lexer, text, start_pos=start_pos, end_pos=end_pos) stream = lexer_thread.lex(None) if self.options.postlex: return self.options.postlex.process(stream) @@ -623,7 +623,8 @@ def get_terminal(self, name: str) -> TerminalDef: """Get information about a terminal""" return self._terminals_dict[name] - def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser': + def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = None, + *, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'InteractiveParser': """Start an interactive parsing session. Parameters: @@ -635,9 +636,11 @@ def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) See Also: ``Lark.parse()`` """ - return self.parser.parse_interactive(text, start=start) + return self.parser.parse_interactive(text, start=start, start_pos=start_pos, end_pos=end_pos) - def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree': + def parse(self, text: str, start: Optional[str] = None, + on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None, + *, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'ParseTree': """Parse the given text, according to the options provided. Parameters: @@ -655,7 +658,7 @@ def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callab For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. """ - return self.parser.parse(text, start=start, on_error=on_error) + return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos) def scan(self, text: str, start: Optional[str]=None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]: """ diff --git a/lark/lexer.py b/lark/lexer.py index cadb244b..a0aedee3 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -1,11 +1,11 @@ # Lexer Implementation - +import sys from abc import abstractmethod, ABC import re from contextlib import suppress from typing import ( TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - ClassVar, TYPE_CHECKING, overload + ClassVar, TYPE_CHECKING, overload, Tuple ) from types import ModuleType import warnings @@ -289,7 +289,7 @@ def __eq__(self, other): return self.char_pos == other.char_pos and self.newline_char == other.newline_char - def feed(self, token: Token, test_newline=True): + def feed(self, token: str, test_newline=True): """Consume a token and calculate the new line & column. As an optional optimization, set test_newline=False if token doesn't contain a newline. @@ -384,20 +384,20 @@ def _build_mres(self, terminals, max_size): terminals = terminals[max_size:] return mres - def match(self, text, pos): + def match(self, text, pos, *, end_pos=sys.maxsize): for mre in self._mres: - m = mre.match(text, pos) + m = mre.match(text, pos, end_pos) if m: return m.group(0), m.lastgroup - def search(self, text, pos): + def search(self, text, start_pos, end_pos): best = None, float("inf") for mre in self._mres: mre: re.Pattern - m = mre.search(text, pos) + m = mre.search(text, start_pos, end_pos) if m: if m.start() < best[1]: - best = (m.group(0), m.lastgroup), m.start() + best = m.lastgroup, m.start() if best[0] is None: return None else: @@ -420,25 +420,46 @@ class LexerState: (Lexer objects are only instantiated per grammar, not per text) """ - __slots__ = 'text', 'line_ctr', 'last_token' + __slots__ = 'text', 'line_ctr', 'end_pos', 'last_token' - text: str + text: Optional[str] line_ctr: LineCounter + end_pos: int last_token: Optional[Token] - def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None): + def __init__(self, text: Optional[str], line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None, + *, start_pos: Optional[int] = None, end_pos: Optional[int] = None): self.text = text self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n') self.last_token = last_token + # If we are not given a text (i.e. via `parse_interactive`), `start_pos` and `end_pos` are ignored + if self.text is None: + self.end_pos = sys.maxsize + return + if start_pos is not None: + if start_pos < 0: + start_pos += len(text) + # We don't call `.feed` here to avoid creating potentially gigantic copies of the text + self.line_ctr.char_pos = start_pos + self.line_ctr.line += text.count(self.line_ctr.newline_char, 0, start_pos) + if self.line_ctr.line != 1: + self.line_ctr.line_start_pos = text.rfind(self.line_ctr.newline_char, 0, start_pos) + self.line_ctr.column = self.line_ctr.char_pos - self.line_ctr.line_start_pos + 1 + self.end_pos = end_pos if end_pos is not None else len(self.text) + if self.end_pos < 0: + self.end_pos += len(text) def __eq__(self, other): if not isinstance(other, LexerState): return NotImplemented - return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token + return (self.text is other.text and + self.line_ctr == other.line_ctr and + self.end_pos == other.end_pos and + self.last_token == other.last_token) def __copy__(self): - return type(self)(self.text, copy(self.line_ctr), self.last_token) + return type(self)(self.text, copy(self.line_ctr), self.last_token, end_pos=self.end_pos) class LexerThread: @@ -450,8 +471,9 @@ def __init__(self, lexer: 'Lexer', lexer_state: LexerState): self.state = lexer_state @classmethod - def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread': - return cls(lexer, LexerState(text)) + def from_text(cls, lexer: 'Lexer', text: str, *, start_pos: Optional[int] = None, + end_pos: Optional[int] = None) -> 'LexerThread': + return cls(lexer, LexerState(text, start_pos=start_pos, end_pos=end_pos)) def lex(self, parser_state): return self.lexer.lex(self.state, parser_state) @@ -597,13 +619,13 @@ def scanner(self): self._build_scanner() return self._scanner - def match(self, text, pos): - return self.scanner.match(text, pos) + def match(self, text, pos, *, end_pos): + return self.scanner.match(text, pos, end_pos=end_pos) def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: line_ctr = lex_state.line_ctr - while line_ctr.char_pos < len(lex_state.text): - res = self.match(lex_state.text, line_ctr.char_pos) + while line_ctr.char_pos < lex_state.end_pos: + res = self.match(lex_state.text, line_ctr.char_pos, end_pos=lex_state.end_pos) if not res: allowed = self.scanner.allowed_types - self.ignore_types if not allowed: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 186058a6..bb88d3c2 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,6 +1,6 @@ from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING -from .exceptions import ConfigurationError, GrammarError, assert_config +from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput from .utils import get_regexp_width, Serialize from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer from .parsers import earley, xearley, cyk @@ -22,8 +22,14 @@ def _wrap_lexer(lexer_class): class CustomLexerWrapper(Lexer): def __init__(self, lexer_conf): self.lexer = lexer_class(lexer_conf) + def lex(self, lexer_state, parser_state): + if lexer_state.line_ctr.char_pos != 0: + raise TypeError("Old Interface Custom Lexer don't support start_pos") + if lexer_state.end_pos != len(lexer_state.text): + raise TypeError("Old Interface Custom Lexer don't support end_pos") return self.lexer.lex(lexer_state.text) + return CustomLexerWrapper @@ -93,23 +99,31 @@ def _verify_start(self, start=None): raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start - def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]: + def _make_lexer_thread(self, text: str, *, start_pos: Optional[int] = None, + end_pos: Optional[int] = None) -> Union[str, LexerThread]: cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread - return text if self.skip_lexer else cls.from_text(self.lexer, text) + if self.skip_lexer: + if start_pos is not None or end_pos is not None: + raise TypeError("lexer='dynamic' does not support start_pos/end_pos") + return text + return cls.from_text(self.lexer, text, start_pos=start_pos, end_pos=end_pos) - def parse(self, text: str, start=None, on_error=None): + def parse(self, text: str, start=None, on_error=None, *, start_pos=None, end_pos=None): chosen_start = self._verify_start(start) kw = {} if on_error is None else {'on_error': on_error} - stream = self._make_lexer_thread(text) + stream = self._make_lexer_thread(text, start_pos=start_pos, end_pos=end_pos) return self.parser.parse(stream, chosen_start, **kw) - def parse_interactive(self, text: Optional[str]=None, start=None): + def parse_interactive(self, text: Optional[str]=None, start=None, + *, start_pos: Optional[int] = None, end_pos: Optional[int] = None): # TODO BREAK - Change text from Optional[str] to text: str = ''. # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return [] + # When this is done, also adjust the code in `LexerState.__init__` since it currently works around being + # passed `None` with regard to start_pos and end_pos chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") - stream = self._make_lexer_thread(text) # type: ignore[arg-type] + stream = self._make_lexer_thread(text, start_pos=start_pos, end_pos=end_pos) # type: ignore[arg-type] return self.parser.parse_interactive(stream, chosen_start) diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 0996c897..9dbbfb69 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -2,6 +2,7 @@ from lark import Lark, Tree + class TestLexer(TestCase): def setUp(self): pass @@ -18,6 +19,18 @@ def test_basic(self): res = list(p.lex("abc cba dd", dont_ignore=True)) assert res == list('abc cba dd') + def test_subset_lex(self): + p = Lark(""" + start: "a" "b" "c" "d" + %ignore " " + """) + + res = list(p.lex("xxxabc cba ddxx", start_pos=3, end_pos=-2)) + assert res == list('abccbadd') + + res = list(p.lex("aaaabc cba dddd", start_pos=3, end_pos=-2)) + assert res == list('abccbadd') + if __name__ == '__main__': main() diff --git a/tests/test_parser.py b/tests/test_parser.py index 1ab705b0..9f3c80b9 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -912,6 +912,7 @@ def test_cycles_with_child_filter(self): self.assertEqual(tree, Tree('a', [Tree('x', [Tree('b', [])])])) + _NAME = "TestFullEarley" + LEXER.capitalize() _TestFullEarley.__name__ = _NAME globals()[_NAME] = _TestFullEarley @@ -2584,8 +2585,29 @@ def test_strict(self): """ self.assertRaises(GrammarError, _Lark, grammar, strict=True) + @unittest.skipIf(LEXER in ('dynamic', 'custom_old'), "start_pos and end_pos not compatible with old style custom/dynamic lexer ") + def test_subset_parse(self): + grammar = r""" + start: (WORD|FRAG_END|FRAG_START)+ + WORD: /\b\w+\b/ # match full word + FRAG_END: /\B\w+/ # end of a word, i.e. start is not at a word boundary + FRAG_START: /\w+\B/ # start of a word, i.e. end is not at a word boundary + %ignore /\s+/ + """ + + parser = _Lark(grammar) + self.assertEqual(parser.parse(" abc def ", start_pos=1, end_pos=-1), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse("xabc def ", start_pos=1, end_pos=-1), + Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')])) + + # We match the behavior of python's re module here: It doesn't look ahead beyond `end_pos`, + # despite looking behind before `start_pos` + self.assertEqual(parser.parse(" abc defx", start_pos=1, end_pos=-1), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + - _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() + _NAME = "TestParser" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME _TestParser.__qualname__ = "tests.test_parser." + _NAME globals()[_NAME] = _TestParser From df3f48fc19c543df39268afee74b201021c7ef45 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 20 Jun 2024 02:07:10 +0200 Subject: [PATCH 3/8] Rework scan, add tests --- lark/lark.py | 40 ++-------------------- lark/lexer.py | 61 ++++++++++++++++++++++++--------- lark/parser_frontends.py | 56 +++++++++++++++++++++++++++++- tests/__main__.py | 1 + tests/test_scan.py | 74 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 178 insertions(+), 54 deletions(-) create mode 100644 tests/test_scan.py diff --git a/lark/lark.py b/lark/lark.py index 5defd4a2..9d897ffb 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -660,7 +660,8 @@ def parse(self, text: str, start: Optional[str] = None, """ return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos) - def scan(self, text: str, start: Optional[str]=None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]: + def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None, + end_pos: Optional[int] = None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]: """ Scans the input text for non-overlapping matches of the rule specified by 'start' and yields the start and end position as well as the resulting tree. @@ -671,41 +672,6 @@ def scan(self, text: str, start: Optional[str]=None) -> Iterator[Tuple[Tuple[int Does not raise any exceptions except for invalid arguments/configurations. """ - if self.options.parser != 'lalr' or self.options.lexer != 'contextual': - raise ValueError("scan requires parser='lalr' and lexer='contextual'") - start_states = self.parser.parser._parse_table.start_states - if start is None: - if len(start_states) != 1: - raise ValueError("Need to specify start") - start, = start_states - start_state = start_states[start] - start_lex: BasicLexer = self.parser.lexer.lexers[start_state] - pos = 0 - while True: - start_pos = start_lex.scanner.search(text, pos) - if start_pos is None: - break - valid_end = [] - ip = self.parse_interactive(text[start_pos:], start=start) - tokens = ip.lexer_thread.lex(ip.parser_state) - while True: - try: - token = next(tokens) - ip.feed_token(token) - except (UnexpectedInput, StopIteration): - break - if '$END' in ip.choices(): - valid_end.append((token, ip.copy())) - for (last, pot) in valid_end[::-1]: - try: - res = pot.feed_eof(last) - except UnexpectedInput: - continue - else: - yield ((start_pos, start_pos + last.end_pos), res) - pos = start_pos + last.end_pos - break - else: - pos = start_pos + 1 + return self.parser.scan(text, start=start, start_pos=start_pos, end_pos=end_pos) ###} diff --git a/lark/lexer.py b/lark/lexer.py index a0aedee3..e4aa28e2 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -254,13 +254,17 @@ def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') -> return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) def __reduce__(self): - return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column)) + return (self.__class__, (self.type, self.value, + self.start_pos, self.line, self.column, + self.end_line, self.end_column, self.end_pos)) def __repr__(self): return 'Token(%r, %r)' % (self.type, self.value) def __deepcopy__(self, memo): - return Token(self.type, self.value, self.start_pos, self.line, self.column) + return Token(self.type, self.value, + self.start_pos, self.line, self.column, + self.end_line, self.end_column, self.end_pos) def __eq__(self, other): if isinstance(other, Token) and self.type != other.type: @@ -303,6 +307,15 @@ def feed(self, token: str, test_newline=True): self.char_pos += len(token) self.column = self.char_pos - self.line_start_pos + 1 + def feed_substring(self, text: str, start_pos: int, end_pos: int): + newlines = text.count(self.newline_char, start_pos, end_pos) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + text.rindex(self.newline_char, start_pos, end_pos) + 1 + + self.char_pos += end_pos - start_pos + self.column = self.char_pos - self.line_start_pos + 1 + class UnlessCallback: def __init__(self, scanner): @@ -397,11 +410,11 @@ def search(self, text, start_pos, end_pos): m = mre.search(text, start_pos, end_pos) if m: if m.start() < best[1]: - best = m.lastgroup, m.start() + best = (m.group(0), m.lastgroup), m.start() if best[0] is None: return None else: - return best[1] + return best def _regexp_has_newline(r: str): @@ -422,30 +435,25 @@ class LexerState: __slots__ = 'text', 'line_ctr', 'end_pos', 'last_token' - text: Optional[str] + text: str line_ctr: LineCounter end_pos: int last_token: Optional[Token] def __init__(self, text: Optional[str], line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None, *, start_pos: Optional[int] = None, end_pos: Optional[int] = None): - self.text = text + self.text = text # type: ignore[assignment] self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n') self.last_token = last_token # If we are not given a text (i.e. via `parse_interactive`), `start_pos` and `end_pos` are ignored - if self.text is None: + if text is None: self.end_pos = sys.maxsize return + self.end_pos = end_pos if end_pos is not None else len(self.text) if start_pos is not None: if start_pos < 0: start_pos += len(text) - # We don't call `.feed` here to avoid creating potentially gigantic copies of the text - self.line_ctr.char_pos = start_pos - self.line_ctr.line += text.count(self.line_ctr.newline_char, 0, start_pos) - if self.line_ctr.line != 1: - self.line_ctr.line_start_pos = text.rfind(self.line_ctr.newline_char, 0, start_pos) - self.line_ctr.column = self.line_ctr.char_pos - self.line_ctr.line_start_pos + 1 - self.end_pos = end_pos if end_pos is not None else len(self.text) + self.line_ctr.feed_substring(text, 0, start_pos) if self.end_pos < 0: self.end_pos += len(text) @@ -496,6 +504,9 @@ class Lexer(ABC): def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: return NotImplemented + def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]: + raise TypeError("This lexer can not be used for searching in text") + def make_lexer_state(self, text): "Deprecated" return LexerState(text) @@ -598,7 +609,7 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None: self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._scanner = None + self._scanner: Optional[Scanner] = None def _build_scanner(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) @@ -614,9 +625,10 @@ def _build_scanner(self): self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def scanner(self): + def scanner(self) -> Scanner: if self._scanner is None: self._build_scanner() + assert self._scanner is not None return self._scanner def match(self, text, pos, *, end_pos): @@ -656,6 +668,19 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: # EOF raise EOFError(self) + def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]: + while True: + res = self.scanner.search(text, start_pos, end_pos) + if not res: + return None + (value, type_), actual_pos = res + if type_ in self.ignore_types: + start_pos = actual_pos + len(value) + continue + t = Token(type_, value, actual_pos, end_pos=start_pos + len(value)) + return t + + class ContextualLexer(Lexer): lexers: Dict[int, AbstractBasicLexer] @@ -710,4 +735,8 @@ def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[ except UnexpectedCharacters: raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. + def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]: + return self.lexers[start_state].search_start(text, start_state, start_pos, end_pos) + + ###} diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index bb88d3c2..0712ccfa 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -127,6 +127,60 @@ def parse_interactive(self, text: Optional[str]=None, start=None, return self.parser.parse_interactive(stream, chosen_start) + def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] = None, + end_pos: Optional[int] = None): + """ + In contrast to the other functions here, this one actually does work. See `Lark.scan` + for a description of what this function is for. + """ + if self.options.parser != 'lalr': + raise ValueError("scan requires parser='lalr' and lexer='contextual'") + start_states = self.parser._parse_table.start_states + chosen_start = self._verify_start(start) + start_state = start_states[chosen_start] + pos = start_pos if start_pos is not None else 0 + end_pos = end_pos if end_pos is not None else len(text) + if pos < 0: + pos += len(text) + if end_pos < 0: + pos += len(text) + del start_pos + while True: + # Find the next candidate location + found = self.lexer.search_start(text, start_state, pos, end_pos) + # No more valid candidates + if found is None: + break + assert found.end_pos <= end_pos + # Collect the potential end points found for this parse + # We need to keep track of multiple options in case there are false `$END`s in the `ip.choices()` + # We don't want to check early since this can be expensive. + valid_end = [] + ip = self.parse_interactive(text, start=chosen_start, start_pos=found.start_pos, end_pos=end_pos) + tokens = ip.lexer_thread.lex(ip.parser_state) + while True: + try: + token = next(tokens) + ip.feed_token(token) + except (UnexpectedInput, StopIteration): + # Either we couldn't parse the characters or the resulting token wasn't valid. + # Either way, stop + break + if '$END' in ip.choices(): + valid_end.append((token, ip.copy())) + # Check through all potential ending points and see if passing in `$END` actually works + for (last, pot) in valid_end[::-1]: + try: + res = pot.feed_eof(last) + except UnexpectedInput: + continue + else: + yield ((found.start_pos, last.end_pos), res) + pos = last.end_pos + break + else: + pos = found.start_pos + 1 + def _validate_frontend_args(parser, lexer) -> None: assert_config(parser, ('lalr', 'earley', 'cyk')) if not isinstance(lexer, type): # not custom lexer? @@ -146,7 +200,7 @@ def _get_lexer_callbacks(transformer, terminals): result[terminal.name] = callback return result -class PostLexConnector: +class PostLexConnector(Lexer): def __init__(self, lexer, postlexer): self.lexer = lexer self.postlexer = postlexer diff --git a/tests/__main__.py b/tests/__main__.py index c5298a77..bd53fe8d 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -14,6 +14,7 @@ from .test_lexer import TestLexer from .test_python_grammar import TestPythonParser from .test_tree_templates import * # We define __all__ to list which TestSuites to run +from .test_scan import TestScan try: from .test_nearley.test_nearley import TestNearley diff --git a/tests/test_scan.py b/tests/test_scan.py new file mode 100644 index 00000000..ecb224f3 --- /dev/null +++ b/tests/test_scan.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import unittest + +from lark import Lark, Tree + + +class TestScan(unittest.TestCase): + def test_scan(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore / +/ + WORD: /\w+/ + """, parser='lalr', start="expr") + + text = "|() | (a) | ((//)) | (c ((d))) |" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 3), Tree('expr', [])), + ((6, 9), Tree('expr', ['a'])), + ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), + ]) + + def test_scan_meta(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="expr", propagate_positions=True) + + text = " (a)\n(b)\n (\n)" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 4), Tree('expr', ['a'])), + ((5, 8), Tree('expr', ['b'])), + ((10, 13), Tree('expr', []))]) + + self.assertEqual(1, finds[0][1].meta.start_pos) + self.assertEqual(4, finds[0][1].meta.end_pos) + self.assertEqual(1, finds[0][1].meta.line) + self.assertEqual(1, finds[0][1].meta.end_line) + self.assertEqual(2, finds[0][1].meta.column) + self.assertEqual(5, finds[0][1].meta.end_column) + + self.assertEqual(5, finds[1][1].meta.start_pos) + self.assertEqual(8, finds[1][1].meta.end_pos) + self.assertEqual(2, finds[1][1].meta.line) + self.assertEqual(2, finds[1][1].meta.end_line) + self.assertEqual(1, finds[1][1].meta.column) + self.assertEqual(4, finds[1][1].meta.end_column) + + self.assertEqual(10, finds[2][1].meta.start_pos) + self.assertEqual(13, finds[2][1].meta.end_pos) + self.assertEqual(3, finds[2][1].meta.line) + self.assertEqual(4, finds[2][1].meta.end_line) + self.assertEqual(2, finds[2][1].meta.column) + self.assertEqual(2, finds[2][1].meta.end_column) + + def test_scan_backtrack(self): + """ Tests that the scan function properly backtracks if it finds partial, but incorrect parses""" + + parser = Lark(r""" + start: expr+ + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="start") + + text = "(a)(b) || (c)(d(e) || (f)" + finds = list(parser.scan(text)) + self.assertEqual(finds, [ + ((0, 6), Tree('start', [Tree('expr', ['a']), Tree('expr', ['b'])])), + ((10, 13), Tree('start', [Tree('expr', ['c'])])), + ((15, 18), Tree('start', [Tree('expr', ['e'])])), + ((22, 25), Tree('start', [Tree('expr', ['f'])])), + ]) From ca0cd554bc612b3323745a200bfde8d58a2d6cf7 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 20 Jun 2024 03:19:43 +0200 Subject: [PATCH 4/8] Small fixes --- tests/test_parser.py | 3 ++- tests/test_scan.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 9f3c80b9..a4627134 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2585,7 +2585,8 @@ def test_strict(self): """ self.assertRaises(GrammarError, _Lark, grammar, strict=True) - @unittest.skipIf(LEXER in ('dynamic', 'custom_old'), "start_pos and end_pos not compatible with old style custom/dynamic lexer ") + @unittest.skipIf(LEXER in ('dynamic', 'dynamic_complete', 'custom_old'), + "start_pos and end_pos not compatible with old style custom/dynamic lexer ") def test_subset_parse(self): grammar = r""" start: (WORD|FRAG_END|FRAG_START)+ diff --git a/tests/test_scan.py b/tests/test_scan.py index ecb224f3..830c1d1f 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import unittest from lark import Lark, Tree From 884d18b0a4ab727bff51822b7e255af8789c4118 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 20 Jun 2024 14:58:59 +0200 Subject: [PATCH 5/8] Address review comments --- lark/lark.py | 4 ++-- lark/lexer.py | 12 ++++++------ lark/parser_frontends.py | 17 +++++++++++------ lark/tools/standalone.py | 2 +- tests/test_parser.py | 2 ++ tests/test_scan.py | 35 +++++++++++++++++++++++++++++++++++ 6 files changed, 57 insertions(+), 15 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 9d897ffb..8b2ba356 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -16,7 +16,7 @@ from typing import Literal else: from typing_extensions import Literal - from .parser_frontends import ParsingFrontend + from .parser_frontends import ParsingFrontend, ScanMatch from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger @@ -661,7 +661,7 @@ def parse(self, text: str, start: Optional[str] = None, return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos) def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]: + end_pos: Optional[int] = None) -> Iterable['ScanMatch']: """ Scans the input text for non-overlapping matches of the rule specified by 'start' and yields the start and end position as well as the resulting tree. diff --git a/lark/lexer.py b/lark/lexer.py index e4aa28e2..4a574b70 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -404,17 +404,17 @@ def match(self, text, pos, *, end_pos=sys.maxsize): return m.group(0), m.lastgroup def search(self, text, start_pos, end_pos): - best = None, float("inf") + best = None for mre in self._mres: mre: re.Pattern m = mre.search(text, start_pos, end_pos) if m: - if m.start() < best[1]: - best = (m.group(0), m.lastgroup), m.start() - if best[0] is None: - return None - else: + if best is None or m.start() < best.start(): + best = m + if best is None: return best + else: + return (best.group(0), best.lastgroup), best.start() def _regexp_has_newline(r: str): diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 0712ccfa..fd582e1a 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, NamedTuple, Iterable, Tuple from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput from .utils import get_regexp_width, Serialize @@ -14,6 +14,12 @@ ###{standalone + +class ScanMatch(NamedTuple): + range: Tuple[int, int] + tree: Tree + + def _wrap_lexer(lexer_class): future_interface = getattr(lexer_class, '__future_interface__', False) if future_interface: @@ -128,13 +134,13 @@ def parse_interactive(self, text: Optional[str]=None, start=None, def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None): + end_pos: Optional[int] = None) -> Iterable[ScanMatch]: """ In contrast to the other functions here, this one actually does work. See `Lark.scan` for a description of what this function is for. """ if self.options.parser != 'lalr': - raise ValueError("scan requires parser='lalr' and lexer='contextual'") + raise ValueError("scan requires parser='lalr'") start_states = self.parser._parse_table.start_states chosen_start = self._verify_start(start) start_state = start_states[chosen_start] @@ -143,8 +149,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] if pos < 0: pos += len(text) if end_pos < 0: - pos += len(text) - del start_pos + end_pos += len(text) while True: # Find the next candidate location found = self.lexer.search_start(text, start_state, pos, end_pos) @@ -175,7 +180,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] except UnexpectedInput: continue else: - yield ((found.start_pos, last.end_pos), res) + yield ScanMatch((found.start_pos, last.end_pos), res) pos = last.end_pos break else: diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 9940ccbf..92b9cf9a 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -30,7 +30,7 @@ from typing import ( TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, Union, Iterable, IO, TYPE_CHECKING, overload, Sequence, - Pattern as REPattern, ClassVar, Set, Mapping + Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple ) ###} diff --git a/tests/test_parser.py b/tests/test_parser.py index a4627134..c0437830 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2599,6 +2599,8 @@ def test_subset_parse(self): parser = _Lark(grammar) self.assertEqual(parser.parse(" abc def ", start_pos=1, end_pos=-1), Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse(" abc def ", start_pos=1-9, end_pos=-1+9), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) self.assertEqual(parser.parse("xabc def ", start_pos=1, end_pos=-1), Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')])) diff --git a/tests/test_scan.py b/tests/test_scan.py index 830c1d1f..cbfaf5de 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -18,6 +18,20 @@ def test_scan(self): ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), ]) + def test_scan_basic_lexer(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore / +/ + WORD: /\w+/ + """, parser='lalr', start="expr", lexer='basic') + + text = "|() | (a) | ((//)) | (c ((d))) |" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 3), Tree('expr', [])), + ((6, 9), Tree('expr', ['a'])), + ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), + ]) + def test_scan_meta(self): parser = Lark(r""" expr: "(" (WORD|expr)* ")" @@ -70,3 +84,24 @@ def test_scan_backtrack(self): ((15, 18), Tree('start', [Tree('expr', ['e'])])), ((22, 25), Tree('start', [Tree('expr', ['f'])])), ]) + + def test_scan_subset(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="expr", propagate_positions=True) + + text = "()\n()(a)\n(b)\n (\n) | \n(\n)" + finds = list(parser.scan(text, start_pos=5, end_pos=-1)) + self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), + ((9, 12), Tree('expr', ['b'])), + ((14, 17), Tree('expr', []))]) + self.assertEqual(2, finds[0][1].meta.line) + + text = "()\n()(a)\n(b)\n (\n) | \n(\n)" + finds = list(parser.scan(text, start_pos=5-len(text), end_pos=-1+len(text))) + self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), + ((9, 12), Tree('expr', ['b'])), + ((14, 17), Tree('expr', []))]) + self.assertEqual(2, finds[0][1].meta.line) From d0d9fcc05fd9b54df6132e71677489d528079e2c Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 20 Jun 2024 20:30:36 +0200 Subject: [PATCH 6/8] Add scan_wikitext example --- examples/advanced/scan_wikitext.py | 42 ++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 examples/advanced/scan_wikitext.py diff --git a/examples/advanced/scan_wikitext.py b/examples/advanced/scan_wikitext.py new file mode 100644 index 00000000..c94e6d7b --- /dev/null +++ b/examples/advanced/scan_wikitext.py @@ -0,0 +1,42 @@ +""" +Showcases how to use `Lark.scan` to select a pattern from a larger text without having to parse all of it. + +Uses `requests` to fetch the current wikitext from `Python (Programming Language)` and uses a simple grammar +to extract all wikitext templates used in the page. + +""" + +from collections import Counter +from pprint import pprint + +import lark +import requests + +page_name = "Python_(programming_language)" +url = f"https://en.wikipedia.org/wiki/{page_name}?action=raw" + +wikitext = requests.get(url).text + +grammar = r""" +template: "{{" TEXT ("|" argument)* "}}" +text: (TEXT|template)+ +argument: /\w+(?==)/ "=" text -> named_argument + | text -> numbered_argument + +TEXT: / (?:[^{}|] + | \{(?!\{) + | \}(?!\}) + )+/x +""" +parser = lark.Lark(grammar, parser='lalr', start='template') +used_templates = Counter() +inner_templates = 0 +for (start, end), res in parser.scan(wikitext): + for temp in res.find_data('template'): + if temp != res: + inner_templates += 1 + used_templates[temp.children[0].value] += 1 + +pprint(used_templates) +print("Total templates used:", used_templates.total()) +print("Number of templates nested inside others:", inner_templates) From 04c2bf638cc36f17ae1e6850345fdae1fd844a4f Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 20 Jun 2024 20:47:29 +0200 Subject: [PATCH 7/8] Improve docs --- lark/lark.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 8b2ba356..96f082d8 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -630,6 +630,8 @@ def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = N Parameters: text (str, optional): Text to be parsed. Required for ``resume_parse()``. start (str, optional): Start symbol + start_pos (int, optional): Position at which the parser starts. Defaults to 0. + end_pos (int, optional): Position at which the parser stops. Defaults to len(text). Returns: A new InteractiveParser instance. @@ -648,6 +650,13 @@ def parse(self, text: str, start: Optional[str] = None, start (str, optional): Required if Lark was given multiple possible start symbols (using the start option). on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. See examples/advanced/error_handling.py for an example of how to use on_error. + start_pos (int, optional): Position at which the parser starts. Defaults to 0. + end_pos (int, optional): Position at which the parser stops. Defaults to len(text). + Both of these don't work with lexer='dynamic'/'dynamic_complete' + Their behavior mirrors the behavior of the corresponding parameters in the Standard Library re module, + which most notably means that look behinds in regex will look behind start_pos, but lookaheads + won't look after end_pos. See [re.search](https://docs.python.org/3/library/re.html#re.Pattern.search) + for more information Returns: If a transformer is supplied to ``__init__``, returns whatever is the @@ -663,14 +672,26 @@ def parse(self, text: str, start: Optional[str] = None, def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> Iterable['ScanMatch']: """ - Scans the input text for non-overlapping matches of the rule specified by 'start' and - yields the start and end position as well as the resulting tree. + Scans the input text for non-overlapping matches of this grammar. - Only works with parser='lalr' and lexer='contextual'. Works best if the first terminal(s) + Only works with parser='lalr'. Works best if the first terminal(s) that can be matched by grammar are unique in the text and always indicate the start of a match. + A found match will never start or end with an ignored terminal. + Does not raise any exceptions except for invalid arguments/configurations. + Parameters: + text (str, optional): Text to be parsed. Required for ``resume_parse()``. + start (str, optional): Start symbol + start_pos (int, optional): Position at which the parser starts. Defaults to 0. + end_pos (int, optional): Position at which the parser stops. Defaults to len(text). + + Returns: + An Iterable of `ScanMatch` instances, which contain two attributes: `range` a tuple with + the indices of the start and end of the found match, and `tree`, the parsed Tree object. + + See Also: ``Lark.parse()`` """ return self.parser.scan(text, start=start, start_pos=start_pos, end_pos=end_pos) From d45cbf0d46526af67ea1264ab6fef7c96495f2e7 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Fri, 21 Jun 2024 15:15:30 +0200 Subject: [PATCH 8/8] temp --- lark/__init__.py | 3 +- lark/lark.py | 20 ++-- lark/lexer.py | 136 +++++++++++++++----------- lark/parser_frontends.py | 70 +++++++------ lark/parsers/lalr_parser.py | 2 +- lark/tools/nearley.py | 2 +- lark/tools/standalone.py | 3 +- pyproject.toml | 2 +- tests/test_lexer.py | 6 +- tests/test_parser.py | 10 +- tests/test_scan.py | 6 +- tests/test_tree_forest_transformer.py | 6 +- tests/test_tree_templates.py | 13 ++- 13 files changed, 155 insertions(+), 124 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index a13c7b3b..e819f99d 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -9,7 +9,7 @@ UnexpectedToken, ) from .lark import Lark -from .lexer import Token +from .lexer import Token, TextSlice from .tree import ParseTree, Tree from .utils import logger from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args @@ -27,6 +27,7 @@ "UnexpectedToken", "Lark", "Token", + "TextSlice", "ParseTree", "Tree", "logger", diff --git a/lark/lark.py b/lark/lark.py index 96f082d8..85634c7d 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -24,7 +24,7 @@ from .tree import Tree from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType -from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token +from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token, TextSlice from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend from .grammar import Rule @@ -600,8 +600,7 @@ def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_pat def __repr__(self): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) - def lex(self, text: str, dont_ignore: bool = False, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> Iterator[Token]: + def lex(self, text: Union[str, 'TextSlice'], dont_ignore: bool = False) -> Iterator[Token]: """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. @@ -613,7 +612,8 @@ def lex(self, text: str, dont_ignore: bool = False, *, start_pos: Optional[int] lexer = self._build_lexer(dont_ignore) else: lexer = self.lexer - lexer_thread = LexerThread.from_text(lexer, text, start_pos=start_pos, end_pos=end_pos) + text = TextSlice.from_text(text) + lexer_thread = LexerThread.from_text(lexer, text) stream = lexer_thread.lex(None) if self.options.postlex: return self.options.postlex.process(stream) @@ -640,9 +640,8 @@ def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = N """ return self.parser.parse_interactive(text, start=start, start_pos=start_pos, end_pos=end_pos) - def parse(self, text: str, start: Optional[str] = None, - on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None, - *, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'ParseTree': + def parse(self, text: Union[str, 'TextSlice'], start: Optional[str] = None, + on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None) -> 'ParseTree': """Parse the given text, according to the options provided. Parameters: @@ -667,10 +666,9 @@ def parse(self, text: str, start: Optional[str] = None, For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. """ - return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos) + return self.parser.parse(text, start=start, on_error=on_error) - def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> Iterable['ScanMatch']: + def scan(self, text: Union[str, TextSlice], start: Optional[str] = None) -> Iterable['ScanMatch']: """ Scans the input text for non-overlapping matches of this grammar. @@ -693,6 +691,6 @@ def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[in See Also: ``Lark.parse()`` """ - return self.parser.scan(text, start=start, start_pos=start_pos, end_pos=end_pos) + return self.parser.scan(text, start=start) ###} diff --git a/lark/lexer.py b/lark/lexer.py index 4a574b70..b8333276 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,9 +3,10 @@ from abc import abstractmethod, ABC import re from contextlib import suppress +from dataclasses import dataclass from typing import ( TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, - ClassVar, TYPE_CHECKING, overload, Tuple + ClassVar, TYPE_CHECKING, overload, Tuple, AnyStr, Generic, Union ) from types import ModuleType import warnings @@ -137,6 +138,33 @@ def user_repr(self) -> str: else: return self.name + +@dataclass(frozen=True) +class TextSlice(Generic[AnyStr]): + text: AnyStr + start: int + end: int + + def __post_init__(self): + if self.start < 0: + object.__setattr__(self, 'start', self.start + len(self.text)) + if self.end < 0: + object.__setattr__(self, 'end', self.end + len(self.text)) + + @classmethod + def from_text(cls, text: Union[AnyStr, 'TextSlice[AnyStr]']) -> 'TextSlice[AnyStr]': + if isinstance(text, TextSlice): + return text + else: + return cls(text, 0, len(text)) + + def is_complete_text(self): + return self.start == 0 and self.end == len(self.text) + + def start_from(self, pos: int): + return TextSlice(self.text, pos, self.end) + + _T = TypeVar('_T', bound="Token") class Token(str): @@ -286,7 +314,8 @@ def __init__(self, newline_char): self.line = 1 self.column = 1 self.line_start_pos = 0 - + def __repr__(self): + return f"" def __eq__(self, other): if not isinstance(other, LineCounter): return NotImplemented @@ -322,9 +351,9 @@ def __init__(self, scanner): self.scanner = scanner def __call__(self, t): - res = self.scanner.match(t.value, 0) - if res: - _value, t.type = res + res = self.scanner.fullmatch(t.value) + if res is not None: + t.type = res return t @@ -360,19 +389,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes)) new_terminals = [t for t in terminals if t not in embedded_strs] return new_terminals, callback class Scanner: - def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + def __init__(self, terminals, g_regex_flags, re_, use_bytes): self.terminals = terminals self.g_regex_flags = g_regex_flags self.re_ = re_ self.use_bytes = use_bytes - self.match_whole = match_whole self.allowed_types = {t.name for t in self.terminals} @@ -382,10 +410,9 @@ def _build_mres(self, terminals, max_size): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. - postfix = '$' if self.match_whole else '' mres = [] while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size]) if self.use_bytes: pattern = pattern.encode('latin-1') try: @@ -397,24 +424,30 @@ def _build_mres(self, terminals, max_size): terminals = terminals[max_size:] return mres - def match(self, text, pos, *, end_pos=sys.maxsize): + def fullmatch(self, text: str): for mre in self._mres: - m = mre.match(text, pos, end_pos) + m = mre.fullmatch(text) if m: - return m.group(0), m.lastgroup + return m.lastgroup + - def search(self, text, start_pos, end_pos): - best = None + def match(self, text: TextSlice, pos: int): + assert pos >= text.start for mre in self._mres: - mre: re.Pattern - m = mre.search(text, start_pos, end_pos) + m = mre.match(text.text, pos, text.end) if m: - if best is None or m.start() < best.start(): - best = m - if best is None: - return best - else: - return (best.group(0), best.lastgroup), best.start() + return m.group(0), m.lastgroup + + def search(self, text: TextSlice, pos: int): + results = list(filter(None, [ + mre.search(text.text, pos, text.end) + for mre in self._mres + ])) + if not results: + return None + + best = min(results, key=lambda m: m.start()) + return (best.group(0), best.lastgroup), best.start() def _regexp_has_newline(r: str): @@ -435,39 +468,27 @@ class LexerState: __slots__ = 'text', 'line_ctr', 'end_pos', 'last_token' - text: str + text: TextSlice line_ctr: LineCounter - end_pos: int last_token: Optional[Token] - def __init__(self, text: Optional[str], line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None, - *, start_pos: Optional[int] = None, end_pos: Optional[int] = None): + def __init__(self, text: Optional[TextSlice], line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None): self.text = text # type: ignore[assignment] self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n') + if text is not None and text.start != 0: + self.line_ctr.feed_substring(text.text, 0, text.start) self.last_token = last_token - # If we are not given a text (i.e. via `parse_interactive`), `start_pos` and `end_pos` are ignored - if text is None: - self.end_pos = sys.maxsize - return - self.end_pos = end_pos if end_pos is not None else len(self.text) - if start_pos is not None: - if start_pos < 0: - start_pos += len(text) - self.line_ctr.feed_substring(text, 0, start_pos) - if self.end_pos < 0: - self.end_pos += len(text) def __eq__(self, other): if not isinstance(other, LexerState): return NotImplemented - return (self.text is other.text and + return (self.text == other.text and self.line_ctr == other.line_ctr and - self.end_pos == other.end_pos and - self.last_token == other.last_token) + self.end_pos == other.end_pos) def __copy__(self): - return type(self)(self.text, copy(self.line_ctr), self.last_token, end_pos=self.end_pos) + return type(self)(self.text, copy(self.line_ctr), self.last_token) class LexerThread: @@ -479,9 +500,8 @@ def __init__(self, lexer: 'Lexer', lexer_state: LexerState): self.state = lexer_state @classmethod - def from_text(cls, lexer: 'Lexer', text: str, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> 'LexerThread': - return cls(lexer, LexerState(text, start_pos=start_pos, end_pos=end_pos)) + def from_text(cls, lexer: 'Lexer', text: TextSlice) -> 'LexerThread': + return cls(lexer, LexerState(text)) def lex(self, parser_state): return self.lexer.lex(self.state, parser_state) @@ -502,9 +522,9 @@ class Lexer(ABC): """ @abstractmethod def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: - return NotImplemented + raise NotImplementedError - def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]: + def search_start(self, text: TextSlice, start_state, pos: int) -> Optional[Token]: raise TypeError("This lexer can not be used for searching in text") def make_lexer_state(self, text): @@ -631,18 +651,18 @@ def scanner(self) -> Scanner: assert self._scanner is not None return self._scanner - def match(self, text, pos, *, end_pos): - return self.scanner.match(text, pos, end_pos=end_pos) + def match(self, text: TextSlice, pos: int) -> Optional[Tuple[str, str]]: + return self.scanner.match(text, pos) def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: line_ctr = lex_state.line_ctr - while line_ctr.char_pos < lex_state.end_pos: - res = self.match(lex_state.text, line_ctr.char_pos, end_pos=lex_state.end_pos) + while line_ctr.char_pos < lex_state.text.end: + res = self.match(lex_state.text, line_ctr.char_pos) if not res: allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} - raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, + raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], state=parser_state, terminals_by_name=self.terminals_by_name) @@ -668,16 +688,16 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: # EOF raise EOFError(self) - def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]: + def search_start(self, text: TextSlice, start_state, pos: int) -> Optional[Token]: while True: - res = self.scanner.search(text, start_pos, end_pos) + res = self.scanner.search(text, pos) if not res: return None (value, type_), actual_pos = res if type_ in self.ignore_types: - start_pos = actual_pos + len(value) + pos = actual_pos + len(value) continue - t = Token(type_, value, actual_pos, end_pos=start_pos + len(value)) + t = Token(type_, value, actual_pos, end_pos=actual_pos + len(value)) return t @@ -735,8 +755,8 @@ def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[ except UnexpectedCharacters: raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. - def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]: - return self.lexers[start_state].search_start(text, start_state, start_pos, end_pos) + def search_start(self, text: TextSlice, start_state, pos: int) -> Optional[Token]: + return self.lexers[start_state].search_start(text, start_state, pos) ###} diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index fd582e1a..20c53ab6 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -2,7 +2,7 @@ from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput from .utils import get_regexp_width, Serialize -from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer +from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, TextSlice from .parsers import earley, xearley, cyk from .parsers.lalr_parser import LALR_Parser from .tree import Tree @@ -21,22 +21,34 @@ class ScanMatch(NamedTuple): def _wrap_lexer(lexer_class): - future_interface = getattr(lexer_class, '__future_interface__', False) - if future_interface: + future_interface = getattr(lexer_class, '__future_interface__', 0) + if future_interface == 2: return lexer_class - else: + elif future_interface == 1: class CustomLexerWrapper(Lexer): def __init__(self, lexer_conf): self.lexer = lexer_class(lexer_conf) def lex(self, lexer_state, parser_state): - if lexer_state.line_ctr.char_pos != 0: - raise TypeError("Old Interface Custom Lexer don't support start_pos") - if lexer_state.end_pos != len(lexer_state.text): - raise TypeError("Old Interface Custom Lexer don't support end_pos") - return self.lexer.lex(lexer_state.text) + if not lexer_state.text.is_complete_text(): + raise TypeError("Interface=1 Custom Lexer don't support TextSlice") + lexer_state.text = lexer_state.text.text + return self.lexer.lex(lexer_state, parser_state) + return CustomLexerWrapper + elif future_interface == 0: + class CustomLexerWrapper(Lexer): + def __init__(self, lexer_conf): + self.lexer = lexer_class(lexer_conf) + def lex(self, lexer_state, parser_state): + if not lexer_state.text.is_complete_text(): + raise TypeError("Interface=0 Custom Lexer don't support TextSlice") + return self.lexer.lex(lexer_state.text.text) return CustomLexerWrapper + else: + raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected") + + def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options): @@ -105,36 +117,36 @@ def _verify_start(self, start=None): raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start - def _make_lexer_thread(self, text: str, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> Union[str, LexerThread]: + def _make_lexer_thread(self, text: Union[str, TextSlice]) -> Union[str, LexerThread]: cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread if self.skip_lexer: - if start_pos is not None or end_pos is not None: - raise TypeError("lexer='dynamic' does not support start_pos/end_pos") + if isinstance(text, TextSlice): + if not text.is_complete_text(): + raise TypeError("lexer='dynamic' does not support TextSlice") + return text.text return text - return cls.from_text(self.lexer, text, start_pos=start_pos, end_pos=end_pos) + text = TextSlice.from_text(text) + return cls.from_text(self.lexer, text) - def parse(self, text: str, start=None, on_error=None, *, start_pos=None, end_pos=None): + def parse(self, text: Union[str, TextSlice], start=None, on_error=None): chosen_start = self._verify_start(start) kw = {} if on_error is None else {'on_error': on_error} - stream = self._make_lexer_thread(text, start_pos=start_pos, end_pos=end_pos) + stream = self._make_lexer_thread(text) return self.parser.parse(stream, chosen_start, **kw) - def parse_interactive(self, text: Optional[str]=None, start=None, - *, start_pos: Optional[int] = None, end_pos: Optional[int] = None): + def parse_interactive(self, text: Union[None, str, TextSlice]=None, start=None): # TODO BREAK - Change text from Optional[str] to text: str = ''. # Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return [] # When this is done, also adjust the code in `LexerState.__init__` since it currently works around being - # passed `None` with regard to start_pos and end_pos + # passed `None` chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") - stream = self._make_lexer_thread(text, start_pos=start_pos, end_pos=end_pos) # type: ignore[arg-type] + stream = self._make_lexer_thread(text) # type: ignore[arg-type] return self.parser.parse_interactive(stream, chosen_start) - def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] = None, - end_pos: Optional[int] = None) -> Iterable[ScanMatch]: + def scan(self, text: Union[str, TextSlice], start: Optional[str]=None) -> Iterable[ScanMatch]: """ In contrast to the other functions here, this one actually does work. See `Lark.scan` for a description of what this function is for. @@ -144,24 +156,20 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] start_states = self.parser._parse_table.start_states chosen_start = self._verify_start(start) start_state = start_states[chosen_start] - pos = start_pos if start_pos is not None else 0 - end_pos = end_pos if end_pos is not None else len(text) - if pos < 0: - pos += len(text) - if end_pos < 0: - end_pos += len(text) + text: TextSlice = TextSlice.from_text(text) # ignore[no-redef] + pos = text.start while True: # Find the next candidate location - found = self.lexer.search_start(text, start_state, pos, end_pos) + found = self.lexer.search_start(text, start_state, pos) # No more valid candidates if found is None: break - assert found.end_pos <= end_pos + assert found.end_pos <= text.end # Collect the potential end points found for this parse # We need to keep track of multiple options in case there are false `$END`s in the `ip.choices()` # We don't want to check early since this can be expensive. valid_end = [] - ip = self.parse_interactive(text, start=chosen_start, start_pos=found.start_pos, end_pos=end_pos) + ip = self.parse_interactive(text.start_from(pos), start=chosen_start) tokens = ip.lexer_thread.lex(ip.parser_state) while True: try: diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 6ae2a04f..728753c4 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None): if isinstance(e, UnexpectedCharacters): # If user didn't change the character position, then we should if p == s.line_ctr.char_pos: - s.line_ctr.feed(s.text[p:p+1]) + s.line_ctr.feed(s.text.text[p:p+1]) try: return e.interactive_parser.resume_parse() diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index 1fc27d56..95569639 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -44,7 +44,7 @@ """ -nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic') +# nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic') def _get_rulename(name): name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 92b9cf9a..78a548d7 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -30,8 +30,9 @@ from typing import ( TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, Union, Iterable, IO, TYPE_CHECKING, overload, Sequence, - Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple + Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple, AnyStr ) +from dataclasses import dataclass ###} import sys diff --git a/pyproject.toml b/pyproject.toml index 8e40e13d..62480606 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ version = {attr = "lark.__version__"} [tool.mypy] files = "lark" -python_version = "3.6" +python_version = "3.8" show_error_codes = true enable_error_code = ["ignore-without-code"] exclude = [ diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 9dbbfb69..cf9dcc48 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -1,6 +1,6 @@ from unittest import TestCase, main -from lark import Lark, Tree +from lark import Lark, Tree, TextSlice class TestLexer(TestCase): @@ -25,10 +25,10 @@ def test_subset_lex(self): %ignore " " """) - res = list(p.lex("xxxabc cba ddxx", start_pos=3, end_pos=-2)) + res = list(p.lex(TextSlice("xxxabc cba ddxx", 3, -2))) assert res == list('abccbadd') - res = list(p.lex("aaaabc cba dddd", start_pos=3, end_pos=-2)) + res = list(p.lex(TextSlice("aaaabc cba dddd", 3, -2))) assert res == list('abccbadd') diff --git a/tests/test_parser.py b/tests/test_parser.py index c0437830..5e79c38f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -30,7 +30,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive -from lark.lexer import Lexer, BasicLexer +from lark.lexer import Lexer, BasicLexer, TextSlice from lark.indenter import Indenter __all__ = ['TestParsers'] @@ -2597,16 +2597,16 @@ def test_subset_parse(self): """ parser = _Lark(grammar) - self.assertEqual(parser.parse(" abc def ", start_pos=1, end_pos=-1), + self.assertEqual(parser.parse(TextSlice(" abc def ", 1, -1)), Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) - self.assertEqual(parser.parse(" abc def ", start_pos=1-9, end_pos=-1+9), + self.assertEqual(parser.parse(TextSlice(" abc def ", 1-9, -1+9)), Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) - self.assertEqual(parser.parse("xabc def ", start_pos=1, end_pos=-1), + self.assertEqual(parser.parse(TextSlice("xabc def ", 1, -1)), Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')])) # We match the behavior of python's re module here: It doesn't look ahead beyond `end_pos`, # despite looking behind before `start_pos` - self.assertEqual(parser.parse(" abc defx", start_pos=1, end_pos=-1), + self.assertEqual(parser.parse(TextSlice(" abc defx", 1, -1)), Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) diff --git a/tests/test_scan.py b/tests/test_scan.py index cbfaf5de..53ccc37a 100644 --- a/tests/test_scan.py +++ b/tests/test_scan.py @@ -1,6 +1,6 @@ import unittest -from lark import Lark, Tree +from lark import Lark, Tree, TextSlice class TestScan(unittest.TestCase): @@ -93,14 +93,14 @@ def test_scan_subset(self): """, parser='lalr', start="expr", propagate_positions=True) text = "()\n()(a)\n(b)\n (\n) | \n(\n)" - finds = list(parser.scan(text, start_pos=5, end_pos=-1)) + finds = list(parser.scan(TextSlice(text, 5, -1))) self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), ((9, 12), Tree('expr', ['b'])), ((14, 17), Tree('expr', []))]) self.assertEqual(2, finds[0][1].meta.line) text = "()\n()(a)\n(b)\n (\n) | \n(\n)" - finds = list(parser.scan(text, start_pos=5-len(text), end_pos=-1+len(text))) + finds = list(parser.scan(TextSlice(text, 5-len(text), -1+len(text)))) self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), ((9, 12), Tree('expr', ['b'])), ((14, 17), Tree('expr', []))]) diff --git a/tests/test_tree_forest_transformer.py b/tests/test_tree_forest_transformer.py index e9600735..f7ac2276 100644 --- a/tests/test_tree_forest_transformer.py +++ b/tests/test_tree_forest_transformer.py @@ -16,9 +16,9 @@ class TestTreeForestTransformer(unittest.TestCase): !bc: "B"? "C"? !cd: "C"? "D" """ - - parser = Lark(grammar, parser='earley', ambiguity='forest') - forest = parser.parse("ABCD") + def setUp(self): + self.parser = Lark(self.grammar, parser='earley', ambiguity='forest') + self.forest = self.parser.parse("ABCD") def test_identity_resolve_ambiguity(self): l = Lark(self.grammar, parser='earley', ambiguity='resolve') diff --git a/tests/test_tree_templates.py b/tests/test_tree_templates.py index ae3c3e07..dd452eac 100644 --- a/tests/test_tree_templates.py +++ b/tests/test_tree_templates.py @@ -35,7 +35,8 @@ class TestTreeTemplatesConf(unittest.TestCase): - parser = Lark(SOME_TEMPLATING_GRAMMAR) + def setUp(self): + self.parser = Lark(SOME_TEMPLATING_GRAMMAR) def test_conf_test_var__not_var(self): conf = TemplateConf(self.parser.parse) @@ -95,8 +96,9 @@ def test_template_match__only_tree(self): class TestTreeTemplatesTemplate(unittest.TestCase): - parser = Lark(SOME_TEMPLATING_GRAMMAR) - conf = TemplateConf(parser.parse) + def setUp(self): + self.parser = Lark(SOME_TEMPLATING_GRAMMAR) + self.conf = TemplateConf(self.parser.parse) def test_template_match__same_tree_no_template__empty_dictionary(self): template = Template(SOME_NON_TEMPLATE_TREE, conf=self.conf) @@ -193,8 +195,9 @@ def test_template_apply_vars__matching_vars__template_replaced(self): class TestTreeTemplatesTemplateTranslator(unittest.TestCase): - parser = Lark(SOME_TEMPLATING_GRAMMAR) - conf = TemplateConf(parser.parse) + def setUp(self): + self.parser = Lark(SOME_TEMPLATING_GRAMMAR) + self.conf = TemplateConf(self.parser.parse) def test_translate__empty_translations__same_tree(self): # no translations to match, so doesn't replace anything & can't error