+* `TERMINAL` - Another terminal, which cannot be defined in terms of this terminal.
+* `"string literal"` - Literal, to be matched as-is.
+* `"string literal"i` - Literal, to be matched case-insensitively.
+* `/regexp literal/` - Regular expression literal. Can include flags.
+* `"character".."character"` - Literal range. The range represents all values between the two literals, inclusively.
+* `(item item ..)` - Group items
+* `(item | item | ..)` - Alternate items.
+* `[item item ..]` - Maybe. Same as `(item item ..)?`, but when `maybe_placeholders=True`, generates `None` if there is no match.
+* `[item | item | ..]` - Maybe with alternates. Same as `(item | item | ..)?`, but when `maybe_placeholders=True`, generates `None` if there is no match.
+* `item?` - Zero or one instances of item (a "maybe")
+* `item*` - Zero or more instances of item
+* `item+` - One or more instances of item
+* `item ~ n` - Exactly *n* instances of item
+* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues)
+An alias is a name for the specific rule alternative. It affects tree construction (see [Shaping the tree](tree_construction#shaping_the_tree)).
+The effect of a rule on the parse tree can be specified by modifiers. The `!` modifier causes the rule to keep all its tokens, regardless of whether they are named or not. The `?` modifier causes the rule to be inlined if it only has a single child. The `?` modifier cannot be used on rules that are named starting with an underscore.
+**Syntax:** +```html +%declare +%declare +``` + ### %override Override a rule or terminals, affecting all references to it, even in imported grammars. Useful for implementing an inheritance pattern when importing grammars. +**Syntax:** +```html +%override ... terminal definition ... +%override ... rule definition ... +``` + **Example:** ```perl %import my_grammar (start, number, NUMBER) @@ -319,6 +349,12 @@ Useful for splitting up a definition of a complex rule with many different optio Can also be used to implement a plugin system where a core grammar is extended by others. +**Syntax:** +```html +%extend ... additional terminal alternate ... +%extend ... additional rule alternate ... +``` + **Example:** ```perl diff --git a/docs/tree_construction.md b/docs/tree_construction.md index 360b1ecab..05690ca3f 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -74,6 +74,7 @@ Lark will parse "((hello world))" as: The brackets do not appear in the tree by design. The words appear because they are matched by a named terminal. + ## Shaping the tree Users can alter the automatic construction of the tree using a collection of grammar features. diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index cdb4d1ca7..41cc06461 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -7,46 +7,66 @@ _item: rule | token | statement -rule: RULE rule_params priority? ":" expansions -token: TOKEN token_params priority? ":" expansions +rule: RULE_MODIFIERS? RULE rule_params priority? ":" rule_expansions +token: TOKEN priority? ":" token_expansions rule_params: ["{" RULE ("," RULE)* "}"] -token_params: ["{" TOKEN ("," TOKEN)* "}"] priority: "." 
NUMBER -statement: "%ignore" expansions -> ignore +statement: "%ignore" ignore_token -> ignore | "%import" import_path ["->" name] -> import | "%import" import_path name_list -> multi_import | "%override" rule -> override_rule + | "%override" token -> override_token | "%declare" name+ -> declare + | "%extend" rule -> extend_rule + | "%extend" token -> extend_token + +ignore_token: ignore_item [ OP | "~" NUMBER [".." NUMBER]] +ignore_item: STRING | TOKEN | REGEXP !import_path: "."? name ("." name)* name_list: "(" name ("," name)* ")" -?expansions: alias (_VBAR alias)* +?rule_expansions: rule_alias (_VBAR rule_alias)* + +?rule_inner_expansions: rule_expansion (_VBAR rule_expansion)* + +?rule_alias: rule_expansion ["->" RULE] + +?rule_expansion: rule_expr* + +?rule_expr: rule_atom [OP | "~" NUMBER [".." NUMBER]] +?rule_atom: "(" rule_inner_expansions ")" + | "[" rule_inner_expansions "]" -> rule_maybe + | rule_value + +?rule_value: RULE "{" rule_value ("," rule_value)* "}" -> rule_template_usage + | RULE + | token_value -?alias: expansion ["->" RULE] +?token_expansions: token_expansion (_VBAR token_expansion)* -?expansion: expr* +?token_expansion: token_expr* -?expr: atom [OP | "~" NUMBER [".." NUMBER]] +?token_expr: token_atom [OP | "~" NUMBER [".." NUMBER]] -?atom: "(" expansions ")" - | "[" expansions "]" -> maybe - | value +?token_atom: "(" token_expansions ")" + | "[" token_expansions "]" -> token_maybe + | token_value -?value: STRING ".." STRING -> literal_range - | name - | (REGEXP | STRING) -> literal - | name "{" value ("," value)* "}" -> template_usage +?token_value: STRING ".." STRING -> literal_range + | TOKEN + | (REGEXP | STRING) -> literal name: RULE | TOKEN _VBAR: _NL? "|" OP: /[+*]|[?](?![a-z])/ -RULE: /!?[_?]?[a-z][_a-z0-9]*/ +RULE: /_?[a-z][_a-z0-9]*/ +RULE_MODIFIERS: /!|![?](?=[a-z])|[?]!?(?=[a-z])/ TOKEN: /_?[A-Z][_A-Z0-9]*/ STRING: _STRING "i"? 
REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ diff --git a/tests/test_grammar_formal.py b/tests/test_grammar_formal.py new file mode 100644 index 000000000..2253510a2 --- /dev/null +++ b/tests/test_grammar_formal.py @@ -0,0 +1,169 @@ +from __future__ import absolute_import + +import os +from unittest import TestCase, main + +from lark import lark, Lark, UnexpectedToken +from lark.load_grammar import GrammarError + + +# Based on TestGrammar, with lots of tests that can't be run elided. +class TestGrammarFormal(TestCase): + def setUp(self): + lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') + # lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark-ORIG') + with open(lark_path, 'r') as f: + self.lark_grammar = "\n".join(f.readlines()) + + def test_errors(self): + # raise NotImplementedError("Doesn't work yet.") + l = Lark(self.lark_grammar, parser="lalr") + + # This is an unrolled form of the test_grammar.py:GRAMMAR_ERRORS tests, because the lark.lark messages vary. 
+ + # 'Incorrect type of value', 'a: 1\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..NUMBER., .1..', l.parse, 'a: 1\n') + # 'Unclosed parenthesis', 'a: (\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'a: (\n') + # 'Unmatched closing parenthesis', 'a: )\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.', l.parse, 'a: )\n') + # 'Unmatched closing parenthesis', 'a: )\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.,', l.parse, 'a: )\n') + # 'Unmatched closing parenthesis', 'a: (\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'a: (\n') + # 'Expecting rule or terminal definition (missing colon)', 'a\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'a\n') + # 'Expecting rule or terminal definition (missing colon)', 'A\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'A\n') + # 'Expecting rule or terminal definition (missing colon)', 'a->\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', l.parse, 'a->\n') + # 'Expecting rule or terminal definition (missing colon)', 'A->\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', l.parse, 'A->\n') + # 'Expecting rule or terminal definition (missing colon)', 'a A\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .A..', l.parse, 'a A\n') + # 'Illegal name for rules or terminals', 'Aa:\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RULE., .a..', l.parse, 'Aa:\n') + # 'Alias expects lowercase name', 'a: -> "a"\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', l.parse, 'a: -> "a"\n') + # 'Unexpected colon', 'a::\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a::\n') + # 'Unexpected colon', 'a: b:\n' + 
self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a: b:\n') + # 'Unexpected colon', 'a: B:\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a: B:\n') + # 'Unexpected colon', 'a: "a":\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a: "a":\n') + # 'Misplaced operator', 'a: b??' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', l.parse, 'a: b??') + # 'Misplaced operator', 'a: b(?)' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', l.parse, 'a: b(?)') + # 'Misplaced operator', 'a:+\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\+..', l.parse, 'a:+\n') + # 'Misplaced operator', 'a:?\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', l.parse, 'a:?\n') + # 'Misplaced operator', 'a:*\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', l.parse, 'a:*\n') + # 'Misplaced operator', 'a:|*\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', l.parse, 'a:|*\n') + # 'Expecting option ("|") or a new rule or terminal definition', 'a:a\n()\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..LPAR.,', l.parse, 'a:a\n()\n') + # 'Terminal names cannot contain dots', 'A.B\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .B..', l.parse, 'A.B\n') + # 'Expecting rule or terminal definition', '"a"\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', l.parse, '"a"\n') + # '%import expects a name', '%import "a"\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', l.parse, '%import "a"\n') + # '%ignore expects a value', '%ignore %import\n' + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_2., .%import..', l.parse, '%ignore %import\n') + + # def 
test_empty_literal(self): + # raise NotImplementedError("Breaks tests/test_parser.py:_TestParser:test_backslash2().") + + # def test_ignore_name(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_override_rule_1(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_override_rule_2(self): + # raise NotImplementedError("Can't test semantics of grammar, only syntax.") + + # def test_override_rule_3(self): + # raise NotImplementedError("Can't test semantics of grammar, only syntax.") + + # def test_override_terminal(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_extend_rule_1(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_extend_rule_2(self): + # raise NotImplementedError("Can't test semantics of grammar, only syntax.") + + # def test_extend_term(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_extend_twice(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_undefined_ignore(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + def test_alias_in_terminal(self): + l = Lark(self.lark_grammar, parser="lalr") + g = """start: TERM + TERM: "a" -> alias + """ + # self.assertRaisesRegex( GrammarError, "Aliasing not allowed in terminals", Lark, g) + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", l.parse, g) + + # def test_undefined_rule(self): + # raise NotImplementedError("Can't test semantics of grammar, only syntax.") + + # def test_undefined_term(self): + # raise NotImplementedError("Can't test semantics of grammar, only syntax.") + + # def test_token_multiline_only_works_with_x_flag(self): + # raise NotImplementedError("Can't test regex flags in Lark grammar.") + + # def test_import_custom_sources(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def 
test_import_custom_sources2(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_import_custom_sources3(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_my_find_grammar_errors(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_ranged_repeat_terms(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_ranged_repeat_large(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_large_terminal(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + # def test_list_grammar_imports(self): + # raise NotImplementedError("Can't test semantics of grammar, only syntax.") + + def test_inline_with_expand_single(self): + l = Lark(self.lark_grammar, parser="lalr") + grammar = r""" + start: _a + !?_a: "A" + """ + # self.assertRaisesRegex(GrammarError, "Inlined rules (_rule) cannot use the ?rule modifier.", l.parse, grammar) + # TODO Is this really catching the right problem? + self.assertRaisesRegex(UnexpectedToken, "Unexpected token Token.'OP', '?'.", l.parse, grammar) + + + # def test_line_breaks(self): + # raise NotImplementedError("Can't parse using parsed grammar.") + + +if __name__ == '__main__': + main() diff --git a/tests/test_lark_lark.py b/tests/test_lark_lark.py new file mode 100644 index 000000000..2183530f7 --- /dev/null +++ b/tests/test_lark_lark.py @@ -0,0 +1,165 @@ +from __future__ import absolute_import + +import os +from unittest import TestCase, main + +from lark import lark, Lark, UnexpectedToken +from lark.load_grammar import GrammarError + + +# Test that certain previous differences between load_grammar.py and +# grammars/lark.lark have been resolved. 
+class TestLarkLark(TestCase): + def setUp(self): + lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') + # lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark-ORIG') + self.lark_parser = Lark.open(lark_path, parser="lalr") + + def test_01_no_alias_in_terminal_lg(self): + g = """start: TERM + TERM: "a" -> alias + """ + self.assertRaisesRegex( GrammarError, "Aliasing not allowed in terminals", Lark, g) + + def test_01_no_alias_in_terminal_ll(self): + g = """start: TERM + TERM: "a" -> alias + """ + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", self.lark_parser.parse, g) + + def test_02_no_rule_aliases_below_top_level_lg(self): + g = """start: rule + rule: ("a" -> alias + | "b") + """ + self.assertRaisesRegex( GrammarError, "Rule 'alias' used but not defined", Lark, g) + + def test_02_no_rule_aliases_below_top_level_ll(self): + g = """start: rule + rule: ("a" -> alias + | "b") + """ + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", self.lark_parser.parse, g) + + def test_03_ignore_single_token_lg(self): + g = """start: TERM + %ignore "a" "b" /c/ + TERM: "d" + """ + # This SHOULD raise some sort of error, but silently discards the extra tokens instead. 
+ # self.assertRaises( UnexpectedToken, Lark, g) + Lark(g) + + def test_03_ignore_single_token_ll(self): + g = """start: TERM + %ignore "a" "b" /c/ + TERM: "d" + """ + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'STRING', '.b.'.", self.lark_parser.parse, g) + + def test_04_extend_rule_lg(self): + g = """ + %import .grammars.ab (startab, A, B, expr) + + %extend expr: B A + """ + Lark(g, start='startab', source_path=__file__) + + def test_04_extend_rule_ll(self): + g = """ + %import .grammars.ab (startab, A, B, expr) + + %extend expr: B A + """ + self.lark_parser.parse(g) + + def test_05_extend_term_lg(self): + g = """ + %import .grammars.ab (startab, A, B, expr) + + %extend A: "c" + """ + Lark(g, start='startab', source_path=__file__) + + def test_05_extend_term_ll(self): + g = """ + %import .grammars.ab (startab, A, B, expr) + + %extend A: "c" + """ + self.lark_parser.parse(g) + + def test_06_no_term_templates_lg(self): + g = """start: TERM + separated{x, sep}: x (sep x)* + TERM: separated{"A", " "} + """ + self.assertRaises( AssertionError, Lark, g) + + def test_06_no_term_templates_ll(self): + g = """start: TERM + separated{x, sep}: x (sep x)* + TERM: separated{"A", " "} + """ + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE', 'separated'.", self.lark_parser.parse, g) + + def test_07_term_no_call_rule_lg(self): + g = """start: TERM + TERM: rule + rule: "a" + """ + self.assertRaisesRegex( GrammarError, "Rules aren't allowed inside terminals", Lark, g) + + def test_07_term_no_call_rule_ll(self): + g = """start: TERM + TERM: rule + rule: "a" + """ + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE', 'rule'.", self.lark_parser.parse, g) + + def test_08_override_term_lg(self): + g = """ + %import .grammars.ab (startab, A, B, expr) + + %override A: "c" + """ + Lark(g, start='startab', source_path=__file__) + + def test_08_override_term_ll(self): + g = """ + %import .grammars.ab (startab, A, B, expr) + + 
%override A: "c" + """ + self.lark_parser.parse(g) + + def test_09_no_rule_modifiers_in_references_lg(self): + g = """start: rule1 + rule1: !?rule2 + rule2: "a" + """ + self.assertRaisesRegex(GrammarError, "Expecting a value, at line 2 column 20", Lark, g) + + def test_09_no_rule_modifiers_in_references_ll(self): + g = """start: rule1 + rule1: !rule2 + rule2: "a" + """ + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE_MODIFIERS', '!'.", self.lark_parser.parse, g) + + def test_10_rule_modifier_query_bang_lg(self): + g = """start: rule1 + rule1: rule2 + ?!rule2: "a" + """ + Lark(g) + + def test_10_rule_modifier_query_bang_ll(self): + g = """start: rule1 + rule1: rule2 + ?!rule2: "a" + """ + self.lark_parser.parse(g) + +if __name__ == '__main__': + main() From 9493f81e9eab6dbe42096938bbcfc1f0cf5a2135 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 1 Feb 2024 14:02:59 -0500 Subject: [PATCH 02/21] 1. Fix "Python type check / Format (pull request)" failure in test_lark_lark.py (trailing whitespace in test_06_*()) 2. Remove what was left of "Literals can be one of: ..." under "Terminals" in grammar.md. 3. Address @erezsh's point about inlined terminals under "Terminals" in grammar.md. 4. Remove "lark.lark-ORIG" references in test_lark_lark.py and test_grammar_formal.py that shouldn't have been pushed. 5. Address @erezsh's point about f.readlines() in test_grammar_formal.py. 6. Address @erezsh's point commented-out tests in test_grammar_formal.py. --- docs/grammar.md | 6 +-- tests/test_grammar_formal.py | 77 +----------------------------------- tests/test_lark_lark.py | 5 +-- 3 files changed, 5 insertions(+), 83 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index a005f3143..a748f0d98 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -62,11 +62,7 @@ Terminals are used to match text into symbols. They can be defined as a combinat [. ] : ``` -Terminal names must be uppercase. 
They must start with an underscore (`_`) or a letter (`A` through `Z`), and may be composed of letters, underscores, and digits (`0` through `9`). Terminal names that start with "_" will not be included in the parse tree, unless the `keep_all_tokens` option is specified. - -Literals can be one of: - -* Literal range: `"a".."z"`, `"1".."9"`, etc. - Each literal must be a single character, and the range represends all values between the two literals, inclusively. +Terminal names must be uppercase. They must start with an underscore (`_`) or a letter (`A` through `Z`), and may be composed of letters, underscores, and digits (`0` through `9`). Terminal names that start with "_" will not be included in the parse tree, unless the `keep_all_tokens` option is specified, or unless they are part of a containing terminal. Each item is one of: diff --git a/tests/test_grammar_formal.py b/tests/test_grammar_formal.py index 2253510a2..71311b435 100644 --- a/tests/test_grammar_formal.py +++ b/tests/test_grammar_formal.py @@ -7,16 +7,14 @@ from lark.load_grammar import GrammarError -# Based on TestGrammar, with lots of tests that can't be run elided. +# Based on TestGrammar, with lots of tests that can't be run deleted. class TestGrammarFormal(TestCase): def setUp(self): lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') - # lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark-ORIG') with open(lark_path, 'r') as f: - self.lark_grammar = "\n".join(f.readlines()) + self.lark_grammar = f.read()) def test_errors(self): - # raise NotImplementedError("Doesn't work yet.") l = Lark(self.lark_grammar, parser="lalr") # This is an unrolled form of the test_grammar.py:GRAMMAR_ERRORS tests, because the lark.lark messages vary. 
@@ -76,39 +74,6 @@ def test_errors(self): # '%ignore expects a value', '%ignore %import\n' self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_2., .%import..', l.parse, '%ignore %import\n') - # def test_empty_literal(self): - # raise NotImplementedError("Breaks tests/test_parser.py:_TestParser:test_backslash2().") - - # def test_ignore_name(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_override_rule_1(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_override_rule_2(self): - # raise NotImplementedError("Can't test semantics of grammar, only syntax.") - - # def test_override_rule_3(self): - # raise NotImplementedError("Can't test semantics of grammar, only syntax.") - - # def test_override_terminal(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_extend_rule_1(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_extend_rule_2(self): - # raise NotImplementedError("Can't test semantics of grammar, only syntax.") - - # def test_extend_term(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_extend_twice(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_undefined_ignore(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - def test_alias_in_terminal(self): l = Lark(self.lark_grammar, parser="lalr") g = """start: TERM @@ -117,39 +82,6 @@ def test_alias_in_terminal(self): # self.assertRaisesRegex( GrammarError, "Aliasing not allowed in terminals", Lark, g) self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", l.parse, g) - # def test_undefined_rule(self): - # raise NotImplementedError("Can't test semantics of grammar, only syntax.") - - # def test_undefined_term(self): - # raise NotImplementedError("Can't test semantics of grammar, only syntax.") - - # def 
test_token_multiline_only_works_with_x_flag(self): - # raise NotImplementedError("Can't test regex flags in Lark grammar.") - - # def test_import_custom_sources(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_import_custom_sources2(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_import_custom_sources3(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_my_find_grammar_errors(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_ranged_repeat_terms(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_ranged_repeat_large(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_large_terminal(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - # def test_list_grammar_imports(self): - # raise NotImplementedError("Can't test semantics of grammar, only syntax.") - def test_inline_with_expand_single(self): l = Lark(self.lark_grammar, parser="lalr") grammar = r""" @@ -157,13 +89,8 @@ def test_inline_with_expand_single(self): !?_a: "A" """ # self.assertRaisesRegex(GrammarError, "Inlined rules (_rule) cannot use the ?rule modifier.", l.parse, grammar) - # TODO Is this really catching the right problem? 
self.assertRaisesRegex(UnexpectedToken, "Unexpected token Token.'OP', '?'.", l.parse, grammar) - # def test_line_breaks(self): - # raise NotImplementedError("Can't parse using parsed grammar.") - - if __name__ == '__main__': main() diff --git a/tests/test_lark_lark.py b/tests/test_lark_lark.py index 2183530f7..c3d2124bd 100644 --- a/tests/test_lark_lark.py +++ b/tests/test_lark_lark.py @@ -12,7 +12,6 @@ class TestLarkLark(TestCase): def setUp(self): lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') - # lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark-ORIG') self.lark_parser = Lark.open(lark_path, parser="lalr") def test_01_no_alias_in_terminal_lg(self): @@ -91,14 +90,14 @@ def test_05_extend_term_ll(self): def test_06_no_term_templates_lg(self): g = """start: TERM - separated{x, sep}: x (sep x)* + separated{x, sep}: x (sep x)* TERM: separated{"A", " "} """ self.assertRaises( AssertionError, Lark, g) def test_06_no_term_templates_ll(self): g = """start: TERM - separated{x, sep}: x (sep x)* + separated{x, sep}: x (sep x)* TERM: separated{"A", " "} """ self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE', 'separated'.", self.lark_parser.parse, g) From 7a2880f06615e2e22f28b72ac608f33afbf4bd66 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 1 Feb 2024 14:33:37 -0500 Subject: [PATCH 03/21] DOH! 
--- tests/test_grammar_formal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_grammar_formal.py b/tests/test_grammar_formal.py index 71311b435..6701c49e9 100644 --- a/tests/test_grammar_formal.py +++ b/tests/test_grammar_formal.py @@ -12,7 +12,7 @@ class TestGrammarFormal(TestCase): def setUp(self): lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') with open(lark_path, 'r') as f: - self.lark_grammar = f.read()) + self.lark_grammar = f.read() def test_errors(self): l = Lark(self.lark_grammar, parser="lalr") From 83a374f5d00cd1f6bf3b43d5bf799cf7ef804cb3 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Fri, 2 Feb 2024 11:19:00 -0500 Subject: [PATCH 04/21] Remove unnessary anchor; coalesce ENBF item sets; fix %override grammar --- docs/grammar.md | 55 ++++++++++++++++----------------------- docs/tree_construction.md | 1 - 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index a748f0d98..ef61ee666 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -51,19 +51,9 @@ Lark begins the parse with the rule 'start', unless specified otherwise in the o Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner). +## EBNF Expressions -## Terminals - -Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals. - -**Syntax:** - -```html - [. ] : -``` - -Terminal names must be uppercase. They must start with an underscore (`_`) or a letter (`A` through `Z`), and may be composed of letters, underscores, and digits (`0` through `9`). Terminal names that start with "_" will not be included in the parse tree, unless the `keep_all_tokens` option is specified, or unless they are part of a containing terminal. 
+The EBNF expression in a Lark terminal definition is a sequence of items to be matched. Each item is one of:
-* `"string literal"i` - Literal, to be matched case-insensitively. -* `/regexp literal/` - Regular expression literal. Can inclde flags. -* `"character".."character"` - Literal range. The range represends all values between the two literals, inclusively. -* template(parameter1, parameter2, ..) - A template to be expanded with the specified parameters. -* `(item item ..)` - Group items -* `(item | item | ..)` - Alternate items. Note that the items cannot have aliases. -* `[item item ..]` - Maybe. Same as `(item item ..)?`, but when `maybe_placeholders=True`, generates `None` if there is no match. -* `[item | item | ..]` - Maybe with alternates. Same as `(item | item | ..)?`, but when `maybe_placeholders=True`, generates `None` if there is no match. Note that the items cannot have aliases. -* `item?` - Zero or one instances of item (a "maybe") -* `item*` - Zero or more instances of item -* `item+` - One or more instances of item -* `item ~ n` - Exactly *n* instances of item -* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues) +See [EBNF Expressions](#ebnf-expressions) above for the list of items that a rule can match. **Examples:** ```perl @@ -325,8 +314,8 @@ Useful for implementing an inheritance pattern when importing grammars. **Syntax:** ```html -%override +%override ``` **Example:** diff --git a/docs/tree_construction.md b/docs/tree_construction.md index 05690ca3f..360b1ecab 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -74,7 +74,6 @@ Lark will parse "((hello world))" as: The brackets do not appear in the tree by design. The words appear because they are matched by a named terminal. -
From fdffb5fe7742a09be3a17b1575827fd4ee1763be Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 8 Feb 2024 21:21:18 -0500 Subject: [PATCH 05/21] Revert lark.lark to its original form. --- lark/grammars/lark.lark | 52 +++++++++++++---------------------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 41cc06461..cdb4d1ca7 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -7,66 +7,46 @@ _item: rule | token | statement -rule: RULE_MODIFIERS? RULE rule_params priority? ":" rule_expansions -token: TOKEN priority? ":" token_expansions +rule: RULE rule_params priority? ":" expansions +token: TOKEN token_params priority? ":" expansions rule_params: ["{" RULE ("," RULE)* "}"] +token_params: ["{" TOKEN ("," TOKEN)* "}"] priority: "." NUMBER -statement: "%ignore" ignore_token -> ignore +statement: "%ignore" expansions -> ignore | "%import" import_path ["->" name] -> import | "%import" import_path name_list -> multi_import | "%override" rule -> override_rule - | "%override" token -> override_token | "%declare" name+ -> declare - | "%extend" rule -> extend_rule - | "%extend" token -> extend_token - -ignore_token: ignore_item [ OP | "~" NUMBER [".." NUMBER]] -ignore_item: STRING | TOKEN | REGEXP !import_path: "."? name ("." name)* name_list: "(" name ("," name)* ")" -?rule_expansions: rule_alias (_VBAR rule_alias)* - -?rule_inner_expansions: rule_expansion (_VBAR rule_expansion)* - -?rule_alias: rule_expansion ["->" RULE] - -?rule_expansion: rule_expr* - -?rule_expr: rule_atom [OP | "~" NUMBER [".." 
NUMBER]] -?rule_atom: "(" rule_inner_expansions ")" - | "[" rule_inner_expansions "]" -> rule_maybe - | rule_value - -?rule_value: RULE "{" rule_value ("," rule_value)* "}" -> rule_template_usage - | RULE - | token_value +?expansions: alias (_VBAR alias)* -?token_expansions: token_expansion (_VBAR token_expansion)* +?alias: expansion ["->" RULE] -?token_expansion: token_expr* +?expansion: expr* -?token_expr: token_atom [OP | "~" NUMBER [".." NUMBER]] +?expr: atom [OP | "~" NUMBER [".." NUMBER]] -?token_atom: "(" token_expansions ")" - | "[" token_expansions "]" -> token_maybe - | token_value +?atom: "(" expansions ")" + | "[" expansions "]" -> maybe + | value -?token_value: STRING ".." STRING -> literal_range - | TOKEN - | (REGEXP | STRING) -> literal +?value: STRING ".." STRING -> literal_range + | name + | (REGEXP | STRING) -> literal + | name "{" value ("," value)* "}" -> template_usage name: RULE | TOKEN _VBAR: _NL? "|" OP: /[+*]|[?](?![a-z])/ -RULE: /_?[a-z][_a-z0-9]*/ -RULE_MODIFIERS: /!|![?](?=[a-z])|[?]!?(?=[a-z])/ +RULE: /!?[_?]?[a-z][_a-z0-9]*/ TOKEN: /_?[A-Z][_A-Z0-9]*/ STRING: _STRING "i"? REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ From 95c574234dd5052836fc1e6dfb95ccbae6df798b Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 8 Feb 2024 22:01:44 -0500 Subject: [PATCH 06/21] Make lark.lark accept the same input as load_grammar.py, and provide a visitor that enforces some of its restrictions. 
--- docs/grammar.md | 44 ++++++++-------- lark/grammars/lark.lark | 27 +++++++--- lark/lark_validator_visitor.py | 93 ++++++++++++++++++++++++++++++++++ tests/__main__.py | 2 + tests/test_ignore.py | 42 +++++++++++++++ tests/test_lark_lark.py | 91 +++++++++++++++++++++++++-------- 6 files changed, 251 insertions(+), 48 deletions(-) create mode 100644 lark/lark_validator_visitor.py create mode 100644 tests/test_ignore.py diff --git a/docs/grammar.md b/docs/grammar.md index ef61ee666..75fcc5961 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -91,26 +91,7 @@ See [EBNF Expressions](#ebnf-expressions) above for the list of items that a ter ### Templates -Templates are expanded when preprocessing rules in the grammar. Templates are not allowed with terminals. - -Definition syntax: - -```ebnf - my_template{param1, param2, ...}: -``` - -Use syntax: - -```ebnf -some_rule: my_template{arg1, arg2, ...} -``` - -Example: -```ebnf -_separated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...' - -num_list: "[" _separated{NUMBER, ","} "]" // Will match "[1, 2, 3]" etc. -``` +Templates are not allowed with terminals. ### Priority @@ -233,6 +214,29 @@ expr: expr operator expr four_words: word ~ 4 ``` +### Templates + +Templates are expanded when preprocessing rules in the grammar. + +Definition syntax: + +```ebnf + my_template{param1, param2, ...}: +``` + +Use syntax: + +```ebnf +some_rule: my_template{arg1, arg2, ...} +``` + +Example: +```ebnf +_separated{x, sep}: x (sep x)* // Define a sequence of 'x sep x sep x ...' + +num_list: "[" _separated{NUMBER, ","} "]" // Will match "[1, 2, 3]" etc. +``` + ### Priority Like terminals, rules can be assigned a priority. 
Rule priorities are signed diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index cdb4d1ca7..83b8ad433 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -1,5 +1,17 @@ # Lark grammar of Lark's syntax # Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py +# This grammar matches that one, but does not enforce some rules that it does. +# If you want to enforce those, you can pass the "LarkValidatorVisitor" over +# the parse tree, like this: + +# import os +# import lark +# from lark.lark_validator_visitor import LarkValidatorVisitor +# +# lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') +# lark_parser = Lark.open(lark_path, parser="lalr") +# parse_tree = lark_parser.parse(my_grammar) +# LarkValidatorVisitor.validate(parse_tree) start: (_item? _NL)* _item? @@ -7,19 +19,21 @@ _item: rule | token | statement -rule: RULE rule_params priority? ":" expansions -token: TOKEN token_params priority? ":" expansions +rule: rule_modifiers? RULE rule_params priority? ":" expansions +token: TOKEN priority? ":" expansions + +rule_modifiers: RULE_MODIFIERS rule_params: ["{" RULE ("," RULE)* "}"] -token_params: ["{" TOKEN ("," TOKEN)* "}"] priority: "." NUMBER statement: "%ignore" expansions -> ignore | "%import" import_path ["->" name] -> import | "%import" import_path name_list -> multi_import - | "%override" rule -> override_rule + | "%override" (rule | token) -> override | "%declare" name+ -> declare + | "%extend" (rule | token) -> extend !import_path: "."? name ("." name)* name_list: "(" name ("," name)* ")" @@ -39,14 +53,15 @@ name_list: "(" name ("," name)* ")" ?value: STRING ".." STRING -> literal_range | name | (REGEXP | STRING) -> literal - | name "{" value ("," value)* "}" -> template_usage + | RULE "{" value ("," value)* "}" -> template_usage name: RULE | TOKEN _VBAR: _NL?
"|" OP: /[+*]|[?](?![a-z])/ -RULE: /!?[_?]?[a-z][_a-z0-9]*/ +RULE_MODIFIERS: /(!|![?]?|[?]!?)(?=[_a-z])/ +RULE: /_?[a-z][_a-z0-9]*/ TOKEN: /_?[A-Z][_A-Z0-9]*/ STRING: _STRING "i"? REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ diff --git a/lark/lark_validator_visitor.py b/lark/lark_validator_visitor.py new file mode 100644 index 000000000..2e3ee82c9 --- /dev/null +++ b/lark/lark_validator_visitor.py @@ -0,0 +1,93 @@ +from .lexer import Token +from .load_grammar import GrammarError +from .visitors import Visitor +from .tree import Tree + +class LarkValidatorVisitor(Visitor): + + @classmethod + def validate(cls, tree: Tree): + visitor = cls() + visitor.visit(tree) + return tree + + def alias(self, tree: Tree): + # Reject alias names in inner 'expansions'. + self._reject_aliases(tree.children[0], "Deep aliasing not allowed") + + def ignore(self, tree: Tree): + # Reject everything except 'literal' and 'name' > 'TOKEN'. + assert len(tree.children) > 0 # The grammar should pass us some things to ignore. 
+ if len(tree.children) > 1: + self._reject_bad_ignore() + node = tree.children[0] + if node.data == "expansions": + if len(node.children) > 1: + self._reject_bad_ignore() + node = node.children[0] + if node.data == "alias": + if len(node.children) > 1: + self._reject_bad_ignore() + node = node.children[0] + if node.data == "expansion": + if len(node.children) > 1: + self._reject_bad_ignore() + node = node.children[0] + if node.data == "expr": + if len(node.children) > 1: + self._reject_bad_ignore() + node = node.children[0] + if node.data == "atom": + if len(node.children) > 1: + self._reject_bad_ignore() + node = node.children[0] + if node.data == "literal": + return + elif node.data == "name": + if node.children[0].data == "TOKEN": + return + elif node.data == "value": + if node.children[0].data == "literal": + return + elif node.children[0].data == "name": + if node.children[0][0].data == "TOKEN": + return + self._reject_bad_ignore() + + def token(self, tree: Tree): + assert len(tree.children) > 1 # The grammar should pass us at least a token name and an item. + first_item = 2 if tree.children[1].data == "priority" else 1 + # Reject alias names in token definitions. + for child in tree.children[first_item:]: + self._reject_aliases(child, "Aliasing not allowed in terminals (You used -> in the wrong place)") + # Reject template usage in token definitions. We do this before checking rules + # because rule usage looks like template usage, just without parameters. + for child in tree.children[first_item:]: + self._reject_templates(child, "Templates not allowed in terminals") + # Reject rule references in token definitions. 
+ for child in tree.children[first_item:]: + self._reject_rules(child, "Rules aren't allowed inside terminals") + + def _reject_aliases(self, item: Tree|Token, message: str): + if isinstance(item, Tree): + if item.data == "alias" and len(item.children) > 1 and item.children[1] is not None: + raise GrammarError(message) + for child in item.children: + self._reject_aliases(child, message) + + def _reject_bad_ignore(self): + raise GrammarError("Bad %ignore - must have a Terminal or other value.") + + def _reject_rules(self, item: Tree|Token, message: str): + if isinstance(item, Token) and item.type == "RULE": + raise GrammarError(message) + elif isinstance(item, Tree): + for child in item.children: + self._reject_rules(child, message) + + def _reject_templates(self, item: Tree|Token, message: str): + if isinstance(item, Tree): + if item.data == "template_usage": + raise GrammarError(message) + for child in item.children: + self._reject_templates(child, message) diff --git a/tests/__main__.py b/tests/__main__.py index c5298a770..80e51d21f 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -9,6 +9,8 @@ from .test_tools import TestStandalone from .test_cache import TestCache from .test_grammar import TestGrammar +from .test_lark_lark import TestLarkLark +from .test_ignore import TestIgnore from .test_reconstructor import TestReconstructor from .test_tree_forest_transformer import TestTreeForestTransformer from .test_lexer import TestLexer diff --git a/tests/test_ignore.py b/tests/test_ignore.py new file mode 100644 index 000000000..94712f1c7 --- /dev/null +++ b/tests/test_ignore.py @@ -0,0 +1,42 @@ +from __future__ import absolute_import + +import os +from unittest import TestCase, main + +from lark import lark, Lark, UnexpectedToken, UnexpectedCharacters +from lark.load_grammar import GrammarError +from lark.lark_validator_visitor import LarkValidatorVisitor + + +# Test that certain previous differences between load_grammar.py and +# grammars/lark.lark have been 
resolved. +class TestIgnore(TestCase): + def setUp(self): + lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') + self.lark_parser = Lark.open(lark_path, parser="lalr") + + def test_load_grammar_import_multiple(self): + g = """ + %ignore A B + start: rule1 + rule1: "c" + A: "a" + B: "b" + """ + l = Lark(g) + self.assertRaisesRegex(UnexpectedCharacters, "No terminal matches 'b' in the current parser context", l.parse, "badbadbad") + + def test_lark_lark_ignore_multiple(self): + g = """ + %ignore A B + start: rule1 + rule1: "c" + A: "a" + B: "b" + """ + t = self.lark_parser.parse(g) + self.assertRaisesRegex(GrammarError, "Bad %ignore - must have a Terminal or other value", LarkValidatorVisitor.validate, t) + + +if __name__ == '__main__': + main() diff --git a/tests/test_lark_lark.py b/tests/test_lark_lark.py index c3d2124bd..4f3d1236f 100644 --- a/tests/test_lark_lark.py +++ b/tests/test_lark_lark.py @@ -5,9 +5,10 @@ from lark import lark, Lark, UnexpectedToken from lark.load_grammar import GrammarError +from lark.lark_validator_visitor import LarkValidatorVisitor -# Test that certain previous differences between load_grammar.py and +# Test that certain previous differences between load_grammar.py and # grammars/lark.lark have been resolved. class TestLarkLark(TestCase): def setUp(self): @@ -21,10 +22,11 @@ def test_01_no_alias_in_terminal_lg(self): self.assertRaisesRegex( GrammarError, "Aliasing not allowed in terminals", Lark, g) def test_01_no_alias_in_terminal_ll(self): + # lark.lark allows aliases in terminals, and rejects them if you run the LarkValidatorVisitor. 
g = """start: TERM TERM: "a" -> alias """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", self.lark_parser.parse, g) + self.lark_parser.parse(g) def test_02_no_rule_aliases_below_top_level_lg(self): g = """start: rule @@ -34,27 +36,12 @@ def test_02_no_rule_aliases_below_top_level_lg(self): self.assertRaisesRegex( GrammarError, "Rule 'alias' used but not defined", Lark, g) def test_02_no_rule_aliases_below_top_level_ll(self): + # lark.lark allows aliases below top-level, and rejects them if you run the LarkValidatorVisitor. g = """start: rule rule: ("a" -> alias | "b") """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", self.lark_parser.parse, g) - - def test_03_ignore_single_token_lg(self): - g = """start: TERM - %ignore "a" "b" /c/ - TERM: "d" - """ - # This SHOULD raise some sort of error, but silently discards the extra tokens instead. - # self.assertRaises( UnexpectedToken, Lark, g) - Lark(g) - - def test_03_ignore_single_token_ll(self): - g = """start: TERM - %ignore "a" "b" /c/ - TERM: "d" - """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'STRING', '.b.'.", self.lark_parser.parse, g) + self.lark_parser.parse(g) def test_04_extend_rule_lg(self): g = """ @@ -93,14 +80,15 @@ def test_06_no_term_templates_lg(self): separated{x, sep}: x (sep x)* TERM: separated{"A", " "} """ - self.assertRaises( AssertionError, Lark, g) + self.assertRaisesRegex( AssertionError, "Tree.'template_usage', .NonTerminal.'separated'.", Lark, g) def test_06_no_term_templates_ll(self): + # lark.lark allows templates in terminals, and rejects them if you run the LarkValidatorVisitor. 
g = """start: TERM separated{x, sep}: x (sep x)* TERM: separated{"A", " "} """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE', 'separated'.", self.lark_parser.parse, g) + self.lark_parser.parse(g) def test_07_term_no_call_rule_lg(self): g = """start: TERM @@ -110,11 +98,12 @@ def test_07_term_no_call_rule_lg(self): self.assertRaisesRegex( GrammarError, "Rules aren't allowed inside terminals", Lark, g) def test_07_term_no_call_rule_ll(self): + # lark.lark allows rules in terminals, and rejects them if you run the LarkValidatorVisitor. g = """start: TERM TERM: rule rule: "a" """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE', 'rule'.", self.lark_parser.parse, g) + self.lark_parser.parse(g) def test_08_override_term_lg(self): g = """ @@ -160,5 +149,63 @@ def test_10_rule_modifier_query_bang_ll(self): """ self.lark_parser.parse(g) + def test_lark_validator_alias_top_level_ok(self): + g = """ + start: rule1 + rule1: rule2 -> alias2 + """ + t = self.lark_parser.parse(g) + LarkValidatorVisitor.validate(t) + + def test_lark_validator_alias_inner_bad(self): + g = """ + start: rule1 + rule1: rule2 + | (rule3 -> alias3 | rule4) + rule2: "a" + rule3: "b" + rule4: "c" + """ + + t = self.lark_parser.parse(g) + self.assertRaisesRegex( Exception, "Deep aliasing not allowed", LarkValidatorVisitor.validate, t) + + def test_lark_validator_import_multi_token_bad(self): + g = """ + %ignore A B + start: rule1 + rule1: "c" + A: "a" + B: "b" + """ + t = self.lark_parser.parse(g) + self.assertRaisesRegex(GrammarError, "Bad %ignore - must have a Terminal or other value", LarkValidatorVisitor.validate, t) + + def test_lark_validator_terminal_alias_bad(self): + g = """ + start: rule1 + rule1: TOKEN2 + TOKEN2: "a" -> alias2 + """ + t = self.lark_parser.parse(g) + self.assertRaisesRegex(GrammarError, "Aliasing not allowed in terminals", LarkValidatorVisitor.validate, t) + + def test_lark_validator_terminal_rule_bad(self): + g = """start: 
TERM + TERM: rule + rule: "a" + """ + t = self.lark_parser.parse(g) + self.assertRaisesRegex(GrammarError, "Rules aren't allowed inside terminals", LarkValidatorVisitor.validate, t) + + def test_lark_validator_terminal_template_bad(self): + g = """start: TERM + separated{x, sep}: x (sep x)* + TERM: separated{"A", " "} + """ + t = self.lark_parser.parse(g) + self.assertRaisesRegex(GrammarError, "Templates not allowed in terminals", LarkValidatorVisitor.validate, t) + + if __name__ == '__main__': main() From 200d6b5f004425174d2364cf701e964c3ca2e2a0 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Fri, 9 Feb 2024 18:28:25 -0500 Subject: [PATCH 07/21] Address some review comments. --- docs/grammar.md | 5 +-- lark/grammars/lark.lark | 2 +- tests/test_grammar_formal.py | 69 ++++++++++++++++-------------------- tests/test_lark_lark.py | 4 +-- 4 files changed, 35 insertions(+), 45 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index 75fcc5961..2686ca4fa 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -59,7 +59,8 @@ Each item is one of: * `TERMINAL` - Another terminal, which cannot be defined in terms of this terminal. * `"string literal"` - Literal, to be matched as-is. * `"string literal"i` - Literal, to be matched case-insensitively. -* `/regexp literal/` - Regular expression literal. Can inclde flags. +* `/regexp literal/[imslux]` - Regular expression literal. Can include the Python stdlib's `re` [flags `imslux`](https://docs.python.org/3/library/re.html#contents-of-module-re) + * `"character".."character"` - Literal range. The range represends all values between the two literals, inclusively. * `(item item ..)` - Group items * `(item | item | ..)` - Alternate items. 
@@ -69,7 +70,7 @@ Each item is one of: * `item*` - Zero or more instances of item * `item+` - One or more instances of item * `item ~ n` - Exactly *n* instances of item -* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues) +* `item ~ n..m` - Between *n* to *m* instances of item The EBNF expression in a Lark rule definition is also a sequence of the same set of items to be matched, with one addition: diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 83b8ad433..b29acc03b 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -9,7 +9,7 @@ # from lark.lark_validator_visitor import LarkValidatorVisitor # # lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') -# lark_parser = Lark.open(lark_path, parser="lalr") +# lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") # parse_tree = lark_parser.parse(my_grammar) # LarkValidatorVisitor.validate(parse_tree) diff --git a/tests/test_grammar_formal.py b/tests/test_grammar_formal.py index 6701c49e9..d657a2364 100644 --- a/tests/test_grammar_formal.py +++ b/tests/test_grammar_formal.py @@ -1,6 +1,5 @@ from __future__ import absolute_import -import os from unittest import TestCase, main from lark import lark, Lark, UnexpectedToken @@ -10,86 +9,78 @@ # Based on TestGrammar, with lots of tests that can't be run deleted. class TestGrammarFormal(TestCase): def setUp(self): - lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') - with open(lark_path, 'r') as f: - self.lark_grammar = f.read() + self.lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") def test_errors(self): - l = Lark(self.lark_grammar, parser="lalr") - # This is an unrolled form of the test_grammar.py:GRAMMAR_ERRORS tests, because the lark.lark messages vary. 
# 'Incorrect type of value', 'a: 1\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..NUMBER., .1..', l.parse, 'a: 1\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..NUMBER., .1..', self.lark_parser.parse, 'a: 1\n') # 'Unclosed parenthesis', 'a: (\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'a: (\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'a: (\n') # 'Unmatched closing parenthesis', 'a: )\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.', l.parse, 'a: )\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.', self.lark_parser.parse, 'a: )\n') # 'Unmatched closing parenthesis', 'a: )\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.,', l.parse, 'a: )\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.,', self.lark_parser.parse, 'a: )\n') # 'Unmatched closing parenthesis', 'a: (\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'a: (\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'a: (\n') # 'Expecting rule or terminal definition (missing colon)', 'a\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'a\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'a\n') # 'Expecting rule or terminal definition (missing colon)', 'A\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', l.parse, 'A\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'A\n') # 'Expecting rule or terminal definition (missing colon)', 'a->\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', l.parse, 'a->\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token 
Token..__ANON_0., .->', self.lark_parser.parse, 'a->\n') # 'Expecting rule or terminal definition (missing colon)', 'A->\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', l.parse, 'A->\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', self.lark_parser.parse, 'A->\n') # 'Expecting rule or terminal definition (missing colon)', 'a A\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .A..', l.parse, 'a A\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .A..', self.lark_parser.parse, 'a A\n') # 'Illegal name for rules or terminals', 'Aa:\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RULE., .a..', l.parse, 'Aa:\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RULE., .a..', self.lark_parser.parse, 'Aa:\n') # 'Alias expects lowercase name', 'a: -> "a"\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', l.parse, 'a: -> "a"\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', self.lark_parser.parse, 'a: -> "a"\n') # 'Unexpected colon', 'a::\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a::\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a::\n') # 'Unexpected colon', 'a: b:\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a: b:\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a: b:\n') # 'Unexpected colon', 'a: B:\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', l.parse, 'a: B:\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a: B:\n') # 'Unexpected colon', 'a: "a":\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', 
l.parse, 'a: "a":\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a: "a":\n') # 'Misplaced operator', 'a: b??' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', l.parse, 'a: b??') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', self.lark_parser.parse, 'a: b??') # 'Misplaced operator', 'a: b(?)' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', l.parse, 'a: b(?)') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', self.lark_parser.parse, 'a: b(?)') # 'Misplaced operator', 'a:+\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\+..', l.parse, 'a:+\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\+..', self.lark_parser.parse, 'a:+\n') # 'Misplaced operator', 'a:?\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', l.parse, 'a:?\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', self.lark_parser.parse, 'a:?\n') # 'Misplaced operator', 'a:*\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', l.parse, 'a:*\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', self.lark_parser.parse, 'a:*\n') # 'Misplaced operator', 'a:|*\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', l.parse, 'a:|*\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', self.lark_parser.parse, 'a:|*\n') # 'Expecting option ("|") or a new rule or terminal definition', 'a:a\n()\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..LPAR.,', l.parse, 'a:a\n()\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..LPAR.,', self.lark_parser.parse, 'a:a\n()\n') # 'Terminal names cannot contain dots', 'A.B\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected 
token Token..TOKEN., .B..', l.parse, 'A.B\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .B..', self.lark_parser.parse, 'A.B\n') # 'Expecting rule or terminal definition', '"a"\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', l.parse, '"a"\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', self.lark_parser.parse, '"a"\n') # '%import expects a name', '%import "a"\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', l.parse, '%import "a"\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', self.lark_parser.parse, '%import "a"\n') # '%ignore expects a value', '%ignore %import\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_2., .%import..', l.parse, '%ignore %import\n') + self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_2., .%import..', self.lark_parser.parse, '%ignore %import\n') def test_alias_in_terminal(self): - l = Lark(self.lark_grammar, parser="lalr") g = """start: TERM TERM: "a" -> alias """ - # self.assertRaisesRegex( GrammarError, "Aliasing not allowed in terminals", Lark, g) - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", l.parse, g) + self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", self.lark_parser.parse, g) def test_inline_with_expand_single(self): - l = Lark(self.lark_grammar, parser="lalr") grammar = r""" start: _a !?_a: "A" """ - # self.assertRaisesRegex(GrammarError, "Inlined rules (_rule) cannot use the ?rule modifier.", l.parse, grammar) - self.assertRaisesRegex(UnexpectedToken, "Unexpected token Token.'OP', '?'.", l.parse, grammar) + self.assertRaisesRegex(UnexpectedToken, "Unexpected token Token.'OP', '?'.", self.lark_parser.parse, grammar) if __name__ == '__main__': diff --git a/tests/test_lark_lark.py b/tests/test_lark_lark.py index 
4f3d1236f..8281136dc 100644 --- a/tests/test_lark_lark.py +++ b/tests/test_lark_lark.py @@ -1,6 +1,5 @@ from __future__ import absolute_import -import os from unittest import TestCase, main from lark import lark, Lark, UnexpectedToken @@ -12,8 +11,7 @@ # grammars/lark.lark have been resolved. class TestLarkLark(TestCase): def setUp(self): - lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') - self.lark_parser = Lark.open(lark_path, parser="lalr") + self.lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") def test_01_no_alias_in_terminal_lg(self): g = """start: TERM From 0fb28f9ad16b461821227f2dbfa71a01d84cf363 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Fri, 9 Feb 2024 19:12:53 -0500 Subject: [PATCH 08/21] Fix review comment re: templates in terminals. --- lark/load_grammar.py | 3 +++ tests/test_lark_lark.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 362a845d2..b7bff233e 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -660,6 +660,9 @@ def maybe(self, expr): def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") + def template_usage(self, t): + raise GrammarError("Templates not allowed in terminals") + def value(self, v): return v[0] diff --git a/tests/test_lark_lark.py b/tests/test_lark_lark.py index 8281136dc..b7a990e7c 100644 --- a/tests/test_lark_lark.py +++ b/tests/test_lark_lark.py @@ -78,7 +78,7 @@ def test_06_no_term_templates_lg(self): separated{x, sep}: x (sep x)* TERM: separated{"A", " "} """ - self.assertRaisesRegex( AssertionError, "Tree.'template_usage', .NonTerminal.'separated'.", Lark, g) + self.assertRaisesRegex( GrammarError, "Templates not allowed in terminals", Lark, g) def test_06_no_term_templates_ll(self): # lark.lark allows templates in terminals, and rejects them if you run the LarkValidatorVisitor. 
From 2ec5ef3dd5a0eeeea479fc7f4c8e76e2fb22e0ff Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Fri, 9 Feb 2024 19:27:20 -0500 Subject: [PATCH 09/21] Fix review comment: Remove inlining from expansions, expansion, and value in lark.lark' --- lark/grammars/lark.lark | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index b29acc03b..86cfb1942 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -38,11 +38,11 @@ statement: "%ignore" expansions -> ignore !import_path: "."? name ("." name)* name_list: "(" name ("," name)* ")" -?expansions: alias (_VBAR alias)* +expansions: alias (_VBAR alias)* ?alias: expansion ["->" RULE] -?expansion: expr* +expansion: expr* ?expr: atom [OP | "~" NUMBER [".." NUMBER]] @@ -50,7 +50,7 @@ name_list: "(" name ("," name)* ")" | "[" expansions "]" -> maybe | value -?value: STRING ".." STRING -> literal_range +value: STRING ".." STRING -> literal_range | name | (REGEXP | STRING) -> literal | RULE "{" value ("," value)* "}" -> template_usage From e9c026eecdc69a0c030a10184bfd61f1d9e7e04c Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Sat, 10 Feb 2024 13:10:33 -0500 Subject: [PATCH 10/21] Address review comment: Make alias and expr optionals, not maybes, so they can be inlined. --- lark/grammars/lark.lark | 4 ++-- lark/lark_validator_visitor.py | 24 ++++++++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 86cfb1942..783308e93 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -40,11 +40,11 @@ name_list: "(" name ("," name)* ")" expansions: alias (_VBAR alias)* -?alias: expansion ["->" RULE] +?alias: expansion ("->" RULE)? expansion: expr* -?expr: atom [OP | "~" NUMBER [".." NUMBER]] +?expr: atom (OP | "~" NUMBER (".." NUMBER)?)? 
?atom: "(" expansions ")" | "[" expansions "]" -> maybe diff --git a/lark/lark_validator_visitor.py b/lark/lark_validator_visitor.py index 2e3ee82c9..0409bdb34 100644 --- a/lark/lark_validator_visitor.py +++ b/lark/lark_validator_visitor.py @@ -11,10 +11,6 @@ def validate(cls, tree: Tree): visitor.visit(tree) return tree - def alias(self, tree: Tree): - # Reject alias names in inner 'expansions'. - self._reject_aliases(tree.children[0], "Deep aliasing not allowed") - def ignore(self, tree: Tree): # Reject everything except 'literal' and 'name' > 'TOKEN'. assert len(tree.children) > 0 # The grammar should pass us some things to ignore. @@ -54,6 +50,19 @@ def ignore(self, tree: Tree): return self._reject_bad_ignore() + def rule(self, tree: Tree): + assert len(tree.children) > 2 # The grammar should pass us at least rule name, rule params, and an item. + first_item = 3 if tree.children[1].data == "priority" else 2 + # Reject alias names in rule definitions below the top level. + node = tree.children[first_item] + if node.data == "expansions": + for child in node.children: + for grandchild in child.children: + self._reject_deep_aliases(grandchild) + elif node.data == "alias": + for child in node.children: + self._reject_deep_aliases(child) + def token(self, tree: Tree): assert len(tree.children) > 1 # The grammar should pass us at least a token name and an item. 
first_item = 2 if tree.children[1].data == "priority" else 1 @@ -68,6 +77,13 @@ def token(self, tree: Tree): for child in tree.children[first_item:]: self._reject_rules(child, "Rules aren't allowed inside terminals") + def _reject_deep_aliases(self, item: Tree|Token): + if isinstance(item, Tree): + if item.data == "alias" and len(item.children) > 1 and item.children[1] is not None: + raise GrammarError("Deep aliasing not allowed") + for child in item.children: + self._reject_deep_aliases(child) + def _reject_aliases(self, item: Tree|Token, message: str): if isinstance(item, Tree): if item.data == "alias" and len(item.children) > 1 and item.children[1] is not None: From 9bf7ddf466979c2f61d6e67d9c4b1e19b3f65e0d Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Sat, 10 Feb 2024 17:30:39 -0500 Subject: [PATCH 11/21] Address review comment: Make '%declare rule' fail in post-processing with a nice message. --- lark/lark_validator_visitor.py | 8 ++++++++ lark/load_grammar.py | 7 ++++--- tests/test_grammar.py | 35 +++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/lark/lark_validator_visitor.py b/lark/lark_validator_visitor.py index 0409bdb34..6e9a2a51a 100644 --- a/lark/lark_validator_visitor.py +++ b/lark/lark_validator_visitor.py @@ -11,6 +11,14 @@ def validate(cls, tree: Tree): visitor.visit(tree) return tree + def declare(self, tree: Tree): + for child in tree.children: + assert child.data == "name" + assert len(child.children) == 1 + assert isinstance(child.children[0], Token) + if child.children[0].type != "TOKEN": + raise GrammarError("Expecting terminal name") + def ignore(self, tree: Tree): # Reject everything except 'literal' and 'name' > 'TOKEN'. assert len(tree.children) > 0 # The grammar should pass us some things to ignore. 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py index b7bff233e..fd85963de 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -1272,13 +1272,14 @@ def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optio self._ignore(*stmt.children) elif stmt.data == 'declare': for symbol in stmt.children: - assert isinstance(symbol, Symbol), symbol - is_term = isinstance(symbol, Terminal) + if isinstance(symbol, NonTerminal): + raise GrammarError("Expecting terminal name") + assert isinstance(symbol, Terminal), symbol if mangle is None: name = symbol.name else: name = mangle(symbol.name) - self._define(name, is_term, None) + self._define(name, True, None) elif stmt.data == 'import': pass else: diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 624b0799a..6ac7b5e30 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -6,6 +6,7 @@ from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors, list_grammar_imports from lark.load_grammar import FromPackageLoader +from lark.lark_validator_visitor import LarkValidatorVisitor class TestGrammar(TestCase): @@ -295,9 +296,41 @@ def test_line_breaks(self): """) p.parse('ab') + def test_declare_rule_lg(self): + g = """ + %declare a + start: b + b: "c" + """ + self.assertRaisesRegex(GrammarError, "Expecting terminal name", Lark, g) + def test_declare_rule_ll(self): + g = """ + %declare a + start: b + b: "c" + """ + l = Lark.open_from_package("lark", "grammars/lark.lark") + t = l.parse(g) + self.assertRaisesRegex(GrammarError, "Expecting terminal name", LarkValidatorVisitor.validate, t) + + def test_declare_token_lg(self): + g = """ + %declare A + start: b + b: "c" + """ + Lark(g) - + def test_declare_token_ll(self): + g = """ + %declare A + start: b + b: "c" + """ + l = Lark.open_from_package("lark", "grammars/lark.lark") + t = l.parse(g) + LarkValidatorVisitor.validate(t) if __name__ == 
'__main__': From 7f02bd130b40b46fcd2709986e9e2a855c82cc9d Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Tue, 13 Feb 2024 15:23:39 -0500 Subject: [PATCH 12/21] lark.lark doesn't allow backslash-nl as a line-continuation, but load_grammar does. --- lark/grammars/lark.lark | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 783308e93..78d81c5fa 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -66,6 +66,7 @@ TOKEN: /_?[A-Z][_A-Z0-9]*/ STRING: _STRING "i"? REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/])*?\/[imslux]*/ _NL: /(\r?\n)+\s*/ +BACKSLASH: /\\[ ]*\n/ %import common.ESCAPED_STRING -> _STRING %import common.SIGNED_INT -> NUMBER @@ -75,3 +76,4 @@ COMMENT: /\s*/ "//" /[^\n]/* | /\s*/ "#" /[^\n]/* %ignore WS_INLINE %ignore COMMENT +%ignore BACKSLASH From 4f7a5ebacaf3afe679ba552aba76a7a6a722b68a Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 14 Mar 2024 21:27:09 -0400 Subject: [PATCH 13/21] Push optionality of rule_modifiers and priority down into rule_modifiers and priority, like load_grammar does --- lark/grammars/lark.lark | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 78d81c5fa..131d9e857 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -19,14 +19,14 @@ _item: rule | token | statement -rule: rule_modifiers? RULE rule_params priority? ":" expansions +rule: rule_modifiers RULE rule_params priority ":" expansions token: TOKEN priority? ":" expansions -rule_modifiers: RULE_MODIFIERS +rule_modifiers: RULE_MODIFIERS? rule_params: ["{" RULE ("," RULE)* "}"] -priority: "." NUMBER +priority: ("." NUMBER)? 
statement: "%ignore" expansions -> ignore | "%import" import_path ["->" name] -> import From 40576d2383f57b58a25d303b599bfcc54cbe23dc Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Fri, 15 Mar 2024 14:30:04 -0400 Subject: [PATCH 14/21] Fix bug introduced in #1018 --- lark/load_grammar.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index fd85963de..a35005865 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -1102,9 +1102,10 @@ def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[Li self._definitions: Dict[str, Definition] = {} self._ignore_names: List[str] = [] - def _grammar_error(self, is_term, msg, *names): + def _grammar_error(self, msg, *subs): args = {} - for i, name in enumerate(names, start=1): + for i, sub in enumerate(subs, start=1): + name, is_term = sub postfix = '' if i == 1 else str(i) args['name' + postfix] = name args['type' + postfix] = lowercase_type = ("rule", "terminal")[is_term] @@ -1130,28 +1131,28 @@ def _check_options(self, is_term, options): def _define(self, name, is_term, exp, params=(), options=None, *, override=False): if name in self._definitions: if not override: - self._grammar_error(is_term, "{Type} '{name}' defined more than once", name) + self._grammar_error("{Type} '{name}' defined more than once", (name, is_term)) elif override: - self._grammar_error(is_term, "Cannot override a nonexisting {type} {name}", name) + self._grammar_error("Cannot override a nonexisting {type} {name}", (name, is_term)) if name.startswith('__'): - self._grammar_error(is_term, 'Names starting with double-underscore are reserved (Error at {name})', name) + self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', (name, is_term)) self._definitions[name] = Definition(is_term, exp, params, self._check_options(is_term, options)) def _extend(self, name, is_term, exp, params=(), 
options=None): if name not in self._definitions: - self._grammar_error(is_term, "Can't extend {type} {name} as it wasn't defined before", name) + self._grammar_error("Can't extend {type} {name} as it wasn't defined before", (name, is_term)) d = self._definitions[name] if is_term != d.is_term: - self._grammar_error(is_term, "Cannot extend {type} {name} - one is a terminal, while the other is not.", name) + self._grammar_error("Cannot extend {type} {name} - one is a terminal, while the other is not.", (name, is_term)) if tuple(params) != d.params: - self._grammar_error(is_term, "Cannot extend {type} with different parameters: {name}", name) + self._grammar_error("Cannot extend {type} with different parameters: {name}", (name, is_term)) if d.tree is None: - self._grammar_error(is_term, "Can't extend {type} {name} - it is abstract.", name) + self._grammar_error("Can't extend {type} {name} - it is abstract.", (name, is_term)) # TODO: think about what to do with 'options' base = d.tree @@ -1362,15 +1363,15 @@ def validate(self) -> None: args = temp.children[1:] if sym not in params: if sym not in self._definitions: - self._grammar_error(d.is_term, "Template '%s' used but not defined (in {type} {name})" % sym, name) + self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, (name, d.is_term)) if len(args) != len(self._definitions[sym].params): expected, actual = len(self._definitions[sym].params), len(args) - self._grammar_error(d.is_term, "Wrong number of template arguments used for {name} " - "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name) + self._grammar_error("Wrong number of template arguments used for {name} " + "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), (sym, sym.isupper()), (name, d.is_term)) for sym in _find_used_symbols(exp): if sym not in self._definitions and sym not in params: - self._grammar_error(d.is_term, "{Type} '{name}' used but not defined (in {type2} {name2})", sym, name) 
+ self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", (sym, sym.isupper()), (name, d.is_term)) if not set(self._definitions).issuperset(self._ignore_names): raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions))) From daac65d84b1d61cf4ff3e5eb00ce99f1b5a5ae02 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Fri, 15 Mar 2024 15:25:35 -0400 Subject: [PATCH 15/21] Issue #1388 is ready for review. --- lark/grammars/lark.lark | 10 +- lark/lark_validator.py | 265 +++++++++++++ lark/lark_validator_visitor.py | 117 ------ lark/load_grammar.py | 5 +- tests/__main__.py | 5 +- tests/test_grammar.py | 692 +++++++++++++++++++-------------- tests/test_grammar_formal.py | 87 ----- tests/test_ignore.py | 42 -- tests/test_lark_lark.py | 209 ---------- tests/test_lark_validator.py | 12 + 10 files changed, 682 insertions(+), 762 deletions(-) create mode 100644 lark/lark_validator.py delete mode 100644 lark/lark_validator_visitor.py delete mode 100644 tests/test_grammar_formal.py delete mode 100644 tests/test_ignore.py delete mode 100644 tests/test_lark_lark.py create mode 100644 tests/test_lark_validator.py diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 131d9e857..6c7d0ed4d 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -1,17 +1,15 @@ # Lark grammar of Lark's syntax # Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py # This grammar matches that one, but does not enfore some rules that it does. 
-# If you want to enforce those, you can pass the "LarkValidatorVisitor" over +# If you want to enforce those, you can pass the "LarkValidator" over # the parse tree, like this: -# import os -# import lark -# from lark.lark_validator_visitor import LarkValidatorVisitor +# from lark import Lark +# from lark.lark_validator import LarkValidator # -# lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') # lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") # parse_tree = lark_parser.parse(my_grammar) -# LarkValidatorVisitor.validate(parse_tree) +# LarkValidator.validate(parse_tree) start: (_item? _NL)* _item? diff --git a/lark/lark_validator.py b/lark/lark_validator.py new file mode 100644 index 000000000..76d9ea3d7 --- /dev/null +++ b/lark/lark_validator.py @@ -0,0 +1,265 @@ +from typing import Any, Dict, List + +from .exceptions import GrammarError +from .grammar import TOKEN_DEFAULT_PRIORITY, RuleOptions +from .lexer import Token +from .load_grammar import eval_escaping +from .tree import Tree + +class Definition: + def __init__(self, is_term, tree, params=(), options=None): + self.is_term = is_term + self.tree = tree + self.params = tuple(params) + +class LarkValidator: + + @classmethod + def validate(cls, tree: Tree, options: Dict[str, Any] = {}): + visitor = cls(tree, options) + visitor._cross_check_symbols() + visitor._resolve_term_references() + visitor._check_literals(tree) + return tree + + def __init__(self, tree: Tree, options: Dict[str, Any]): + self._definitions: Dict[str, Definition] = {} + self._ignore_names: List[str] = [] + self._load_grammar(tree) + + def _check_literals(self, tree: Tree) -> None: + for literal in tree.find_data("literal"): + self._literal(literal) + + def _cross_check_symbols(self) -> None: + # Based on load_grammar.GrammarBuilder.validate() + for name, d in self._definitions.items(): + params = d.params + definition = d.tree + for i, p in enumerate(params): + if p in 
self._definitions: +                    raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) +                if p in params[:i]: +                    raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) +            # Remaining checks don't apply to abstract rules/terminals (i.e., created with %declare) +            if definition and isinstance(definition, Tree): +                for template in definition.find_data('template_usage'): +                    if d.is_term: +                        raise GrammarError("Templates not allowed in terminals") +                    sym = template.children[0].data +                    args = template.children[1:] +                    if sym not in params: +                        if sym not in self._definitions: +                            raise GrammarError(f"Template '{sym}' used but not defined (in {('rule', 'terminal')[d.is_term]} {name})") +                        if len(args) != len(self._definitions[sym].params): +                            expected, actual = len(self._definitions[sym].params), len(args) +                            raise GrammarError(f"Wrong number of template arguments used for {sym} " +                                               f"(expected {expected}, got {actual}) (in {('rule', 'terminal')[d.is_term]} {name})") +            for sym in _find_used_symbols(definition): +                if sym not in self._definitions and sym not in params: +                    raise GrammarError(f"{('Rule', 'Terminal')[sym.isupper()]} '{sym}' used but not defined (in {('rule', 'terminal')[d.is_term]} { name})") +        if not set(self._definitions).issuperset(self._ignore_names): +            raise GrammarError("Terminals %s were marked to ignore but were not defined!"
% (set(self._ignore_names) - set(self._definitions))) + +    def _declare(self, stmt: Tree) -> None: +        for symbol in stmt.children: +            if isinstance(symbol, Tree) and symbol.data == 'name': +                symbol = symbol.children[0] +            if not isinstance(symbol, Token) or symbol.type != "TOKEN": +                raise GrammarError("Expecting terminal name") +            self._define(symbol.value, True, None) + +    def _define(self, name: str, is_term: bool, exp: "Tree|None", params: List[str] = [], options:Any = None, *, override: bool = False, extend: bool = False) -> None: +        # Based on load_grammar.GrammarBuilder._define() +        if name in self._definitions: +            if not override and not extend: +                raise GrammarError(f"{('Rule', 'Terminal')[is_term]} '{name}' defined more than once") +        if extend: +            base_def = self._definitions[name] +            if is_term != base_def.is_term: +                raise GrammarError(f"Cannot extend {('rule', 'terminal')[is_term]} {name} - one is a terminal, while the other is not.") +            if tuple(params) != base_def.params: +                raise GrammarError(f"Cannot extend {('rule', 'terminal')[is_term]} with different parameters: {name}") +            if base_def.tree is None: +                raise GrammarError(f"Can't extend {('rule', 'terminal')[is_term]} {name} - it is abstract.") +        if name.startswith('__'): +            raise GrammarError(f'Names starting with double-underscore are reserved (Error at {name})') +        if is_term: +            if options and not isinstance(options, int): +                raise GrammarError(f"Terminals require a single int as 'options' (e.g.
priority), got {type(options)}") + else: + if options and not isinstance(options, RuleOptions): + raise GrammarError("Rules require a RuleOptions instance as 'options'") + self._definitions[name] = Definition(is_term, exp, params) + + def _extend(self, stmt: Tree) -> None: + definition = stmt.children[0] + if definition.data == 'token': + name = definition.children[0] + if name not in self._definitions: + raise GrammarError(f"Can't extend terminal {name} as it wasn't defined before") + self._token(definition, extend=True) + else: # definition.data == 'rule' + name = definition.children[1] + if name not in self._definitions: + raise GrammarError(f"Can't extend rule {name} as it wasn't defined before") + self._rule(definition, extend=True) + + def _ignore(self, stmt: Tree) -> None: + # Children: expansions + # - or - + # Children: token + exp_or_name = stmt.children[0] + if isinstance(exp_or_name, str): + self._ignore_names.append(exp_or_name) + else: + assert isinstance(exp_or_name, Tree) + t = exp_or_name + if t.data == 'expansions' and len(t.children) == 1: + t2 ,= t.children + if t2.data=='expansion': + if len(t2.children) > 1: + raise GrammarError("Bad %ignore - must have a Terminal or other value.") + item ,= t2.children + if item.data == 'value': + item ,= item.children + if isinstance(item, Token): + # Keep terminal name, no need to create a new definition + self._ignore_names.append(item.value) + return + if item.data == 'name': + token ,= item.children + if isinstance(token, Token) and token.type == "TOKEN": + # Keep terminal name, no need to create a new definition + self._ignore_names.append(token.value) + return + name = '__IGNORE_%d'% len(self._ignore_names) + self._ignore_names.append(name) + self._definitions[name] = Definition(True, t, options=TOKEN_DEFAULT_PRIORITY) + + def _literal(self, tree: Tree) -> None: + # Based on load_grammar.GrammarBuilder.literal_to_pattern(). 
+ assert tree.data == 'literal' + literal = tree.children[0] + assert isinstance(literal, Token) + v = literal.value + flag_start = max(v.rfind('/'), v.rfind('"'))+1 + assert flag_start > 0 + flags = v[flag_start:] + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + v = v[:flag_start] + assert v[0] == v[-1] and v[0] in '"/' + x = v[1:-1] + s = eval_escaping(x) + if s == "": + raise GrammarError("Empty terminals are not allowed (%s)" % literal) + + def _load_grammar(self, tree: Tree) -> None: + for stmt in tree.children: + if stmt.data == 'declare': + self._declare(stmt) + elif stmt.data == 'extend': + self._extend(stmt) + elif stmt.data == 'ignore': + self._ignore(stmt) + elif stmt.data in ['import', 'multi_import']: + # TODO How can we process imports in the validator? 
+ pass + elif stmt.data == 'override': + self._override(stmt) + elif stmt.data == 'rule': + self._rule(stmt) + elif stmt.data == 'token': + self._token(stmt) + else: + assert False, f"Unknown statement type: {stmt}" + + def _override(self, stmt: Tree) -> None: + definition = stmt.children[0] + if definition.data == 'token': + name = definition.children[0] + if name not in self._definitions: + raise GrammarError(f"Cannot override a nonexisting terminal {name}") + self._token(definition, override=True) + else: # definition.data == 'rule' + name = definition.children[1] + if name not in self._definitions: + raise GrammarError(f"Cannot override a nonexisting rule {name}") + self._rule(definition, override=True) + + def _resolve_term_references(self) -> None: + # Based on load_grammar.resolve_term_references() + # and the bottom of load_grammar.GrammarBuilder.load_grammar() + term_dict = { name: d.tree + for name, d in self._definitions.items() + if d.is_term + } + while True: + changed = False + for name, token_tree in term_dict.items(): + if token_tree is None: # Terminal added through %declare + continue + for exp in token_tree.find_data('value'): + item ,= exp.children + if isinstance(item, Tree) and item.data == 'name' and isinstance(item.children[0], Token) and item.children[0].type == 'RULE' : + raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) + elif isinstance(item, Token): + try: + term_value = term_dict[item.value] + except KeyError: + raise GrammarError("Terminal used but not defined: %s" % item.value) + assert term_value is not None + exp.children[0] = term_value + changed = True + else: + assert isinstance(item, Tree) + if not changed: + break + + for name, term in term_dict.items(): + if term: # Not just declared + for child in term.children: + ids = [id(x) for x in child.iter_subtrees()] + if id(term) in ids: + raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) + 
+ def _rule(self, tree, override=False, extend=False) -> None: + # Children: modifiers, name, params, priority, expansions + name = tree.children[1] + if tree.children[0].data == "rule_modifiers" and tree.children[0].children: + modifiers = tree.children[0].children[0] + if '?' in modifiers and name.startswith('_'): + raise GrammarError("Inlined rules (_rule) cannot use the ?rule modifier.") + if tree.children[2].children[0] is not None: + params = [t.value for t in tree.children[2].children] # For the grammar parser + else: + params = [] + self._define(name, False, tree.children[4], params=params, override=override, extend=extend) + + def _token(self, tree, override=False, extend=False) -> None: + # Children: name, priority, expansions + # - or - + # Children: name, expansions + if tree.children[1].data == "priority" and tree.children[1].children: + opts = int(tree.children[1].children[0]) # priority + else: + opts = TOKEN_DEFAULT_PRIORITY + for item in tree.children[-1].find_data('alias'): + raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") + self._define(tree.children[0].value, True, tree.children[-1], [], opts, override=override, extend=extend) + +def _find_used_symbols(tree) -> List[str]: + # Based on load_grammar.GrammarBuilder._find_used_symbols() + assert tree.data == 'expansions' + results = [] + for expansion in tree.find_data('expansion'): + for item in expansion.scan_values(lambda t: True): + if isinstance(item, Tree) and item.data == 'name': + results.append(item.data) + elif isinstance(item, Token) and item.type not in ['NUMBER', 'OP', 'STRING', 'REGEXP']: + results.append(item.value) + return results diff --git a/lark/lark_validator_visitor.py b/lark/lark_validator_visitor.py deleted file mode 100644 index 6e9a2a51a..000000000 --- a/lark/lark_validator_visitor.py +++ /dev/null @@ -1,117 +0,0 @@ -from .lexer import Token -from .load_grammar import GrammarError -from .visitors import Visitor -from .tree import 
Tree - -class LarkValidatorVisitor(Visitor): - - @classmethod - def validate(cls, tree: Tree): - visitor = cls() - visitor.visit(tree) - return tree - - def declare(self, tree: Tree): - for child in tree.children: - assert child.data == "name" - assert len(child.children) == 1 - assert isinstance(child.children[0], Token) - if child.children[0].type != "TOKEN": - raise GrammarError("Expecting terminal name") - - def ignore(self, tree: Tree): - # Reject everything except 'literal' and 'name' > 'TOKEN'. - assert len(tree.children) > 0 # The grammar should pass us some things to ignore. - if len(tree.children) > 1: - self._reject_bad_ignore() - node = tree.children[0] - if node.data == "expansions": - if len(node.children) > 1: - self._reject_bad_ignore() - node = node.children[0] - if node.data == "alias": - if len(node.children) > 1: - self._reject_bad_ignore() - node = node.children[0] - if node.data == "expansion": - if len(node.children) > 1: - self._reject_bad_ignore() - node = node.children[0] - if node.data == "expr": - if len(node.children) > 1: - self._reject_bad_ignore() - node = node.children[0] - if node.data == "atom": - if len(node.children) > 1: - self._reject_bad_ignore() - node = node.children[0] - if node.data == "literal": - return - elif node.data == "name": - if node.children[0].data == "TOKEN": - return - elif node.data == "value": - if node.children[0].data == "literal": - return - elif node.children[0].data == "name": - if node.children[0][0].data == "TOKEN": - return - self._reject_bad_ignore() - - def rule(self, tree: Tree): - assert len(tree.children) > 2 # The grammar should pass us at least rule name, rule params, and an item. - first_item = 3 if tree.children[1].data == "priority" else 2 - # Reject alias names in rule definitions below the top level. 
- node = tree.children[first_item] - if node.data == "expansions": - for child in node.children: - for grandchild in child.children: - self._reject_deep_aliases(grandchild) - elif node.data == "alias": - for child in node.children: - self._reject_deep_aliases(child) - - def token(self, tree: Tree): - assert len(tree.children) > 1 # The grammar should pass us at least a token name and an item. - first_item = 2 if tree.children[1].data == "priority" else 1 - # Reject alias names in token definitions. - for child in tree.children[first_item:]: - self._reject_aliases(child, "Aliasing not allowed in terminals (You used -> in the wrong place)") - # Reject template usage in token definitions. We do this before checking rules - # because rule usage looks like template usage, just without parameters. - for child in tree.children[first_item:]: - self._reject_templates(child, "Templates not allowed in terminals") - # Reject rule references in token definitions. - for child in tree.children[first_item:]: - self._reject_rules(child, "Rules aren't allowed inside terminals") - - def _reject_deep_aliases(self, item: Tree|Token): - if isinstance(item, Tree): - if item.data == "alias" and len(item.children) > 1 and item.children[1] is not None: - raise GrammarError("Deep aliasing not allowed") - for child in item.children: - self._reject_deep_aliases(child) - - def _reject_aliases(self, item: Tree|Token, message: str): - if isinstance(item, Tree): - if item.data == "alias" and len(item.children) > 1 and item.children[1] is not None: - raise GrammarError(message) - for child in item.children: - self._reject_aliases(child, message) - - def _reject_bad_ignore(self): - raise GrammarError("Bad %ignore - must have a Terminal or other value.") - - def _reject_rules(self, item: Tree|Token, message: str): - if isinstance(item, Token) and item.type == "RULE": - raise GrammarError(message) - elif isinstance(item, Tree): - for child in item.children: - self._reject_rules(child, message) - - def 
_reject_templates(self, item: Tree|Token, message: str): - if isinstance(item, Tree): - if item.data == "template_usage": - raise GrammarError(message) - for child in item.children: - self._reject_templates(child, message) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a35005865..08531391b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -1168,7 +1168,9 @@ def _ignore(self, exp_or_name): t = exp_or_name if t.data == 'expansions' and len(t.children) == 1: t2 ,= t.children - if t2.data=='expansion' and len(t2.children) == 1: + if t2.data=='expansion': + if len(t2.children) > 1: + raise GrammarError("Bad %ignore - must have a Terminal or other value.") item ,= t2.children if item.data == 'value': item ,= item.children @@ -1242,7 +1244,6 @@ def _unpack_definition(self, tree, mangle): def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optional[Callable[[str], str]]=None) -> None: tree = _parse_grammar(grammar_text, grammar_name) - imports: Dict[Tuple[str, ...], Tuple[Optional[str], Dict[str, str]]] = {} for stmt in tree.children: diff --git a/tests/__main__.py b/tests/__main__.py index 80e51d21f..6ed44ffae 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -8,14 +8,12 @@ from .test_trees import TestTrees from .test_tools import TestStandalone from .test_cache import TestCache -from .test_grammar import TestGrammar -from .test_lark_lark import TestLarkLark -from .test_ignore import TestIgnore from .test_reconstructor import TestReconstructor from .test_tree_forest_transformer import TestTreeForestTransformer from .test_lexer import TestLexer from .test_python_grammar import TestPythonParser from .test_tree_templates import * # We define __all__ to list which TestSuites to run +from .test_lark_validator import TestLarkValidator try: from .test_nearley.test_nearley import TestNearley @@ -28,6 +26,7 @@ from .test_logger import Testlogger from .test_parser import * # We define __all__ to list which TestSuites to run 
+from .test_grammar import * # We define __all__ to list which TestSuites to run if sys.version_info >= (3, 10): from .test_pattern_matching import TestPatternMatching diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 6ac7b5e30..8a4b75c88 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -1,337 +1,437 @@ from __future__ import absolute_import +import re import os -from unittest import TestCase, main +from unittest import TestCase, main, SkipTest from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors, list_grammar_imports from lark.load_grammar import FromPackageLoader -from lark.lark_validator_visitor import LarkValidatorVisitor +from lark.lark_validator import LarkValidator + + +__all__ = [] + +class LarkDotLark: + def __init__(self, grammar, **kwargs): + options = {} + options.update(kwargs) + if "start" in options and options["start"] != "start": + # We're not going to parse with the parser, so just override it. + options["start"] = "start" + options["propagate_positions"] = True + lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", **options) + tree = lark_parser.parse(grammar) + LarkValidator.validate(tree) + + def parse(self, text: str, start=None, on_error=None): + raise SkipTest("Cannot test cases with lark.lark that try to parse using the tested grammar.") + + +def _make_tests(parser): + class _TestGrammar(TestCase): + def test_errors(self): + # TODO test_errors needs a lot of work for lark.lark. 
+ if parser == LarkDotLark: + self.skipTest("test_errors needs a lot of work for lark.lark.") + for msg, examples in GRAMMAR_ERRORS: + for example in examples: + with self.subTest(example=example): + self.assertRaisesRegex(GrammarError, re.escape(msg), parser, example) + + def test_empty_literal(self): + # Issues #888 + self.assertRaisesRegex(GrammarError, "Empty terminals are not allowed \(\"\"\)", parser, "start: \"\"") + + def test_ignore_name(self): + spaces = [] + p = parser(""" + start: "a" "b" + WS: " " + %ignore WS + """, parser='lalr', lexer_callbacks={'WS': spaces.append}) + assert p.parse("a b") == p.parse("a b") + assert len(spaces) == 5 + + def test_override_rule1(self): + # Overrides the 'sep' template in existing grammar to add an optional terminating delimiter + # Thus extending it beyond its original capacity + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(""" + %import .test_templates_import (start, sep) + + %override sep{item, delim}: item (delim item)* delim? + %ignore " " + """, source_path=__file__) + + a = p.parse('[1, 2, 3]') + b = p.parse('[1, 2, 3, ]') + assert a == b + + def test_override_rule2(self): + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + self.assertRaisesRegex(GrammarError, "Rule 'delim' used but not defined \(in rule sep\)", parser, """ + %import .test_templates_import (start, sep) + + %override sep{item}: item (delim item)* delim? + """, source_path=__file__) + + def test_override_rule3(self): + self.assertRaisesRegex(GrammarError, "Cannot override a nonexisting rule sep", parser, """ + %override sep{item}: item (delim item)* delim? 
+ """, source_path=__file__) + + def test_override_terminal(self): + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(""" + + %import .grammars.ab (startab, A, B) + + %override A: "c" + %override B: "d" + """, start='startab', source_path=__file__) + + a = p.parse('cd') + self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')]) + + def test_extend_rule1(self): + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(""" + %import .grammars.ab (startab, A, B, expr) + + %extend expr: B A + """, start='startab', source_path=__file__) + a = p.parse('abab') + self.assertEqual(a.children[0].children, ['a', Tree('expr', ['b', 'a']), 'b']) + + def test_extend_rule2(self): + self.assertRaisesRegex(GrammarError, "Can't extend rule expr as it wasn't defined before", parser, """ + %extend expr: B A + """) + + def test_extend_term(self): + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(""" + %import .grammars.ab (startab, A, B, expr) + + %extend A: "c" + """, start='startab', source_path=__file__) + a = p.parse('acbb') + self.assertEqual(a.children[0].children, ['a', Tree('expr', ['c', 'b']), 'b']) + + def test_extend_twice(self): + p = parser(""" + start: x+ + + x: "a" + %extend x: "b" + %extend x: "c" + """) + + assert p.parse("abccbba") == p.parse("cbabbbb") + + def test_undefined_ignore1(self): + g = """!start: "A" + + %ignore B + """ + self.assertRaisesRegex( GrammarError, "Terminals {'B'} were marked to ignore but were not defined!", parser, g) + + def test_undefined_ignore2(self): + g = """!start: "A" + + %ignore start + """ + self.assertRaisesRegex( GrammarError, "Rules aren't allowed inside terminals ", parser, g) + + def test_alias_in_terminal(self): + g = """start: TERM + TERM: "a" -> alias + """ + self.assertRaisesRegex( GrammarError, "Aliasing 
not allowed in terminals \(You used -> in the wrong place\)", parser, g) + + def test_undefined_rule(self): + self.assertRaisesRegex(GrammarError, "Rule 'a' used but not defined \(in rule start\)", parser, """start: a""") + + def test_undefined_term(self): + self.assertRaisesRegex(GrammarError, "Terminal 'A' used but not defined \(in rule start\)", parser, """start: A""") + + def test_token_multiline_only_works_with_x_flag(self): + g = r"""start: ABC + ABC: / a b c + d + e f + /i + """ + self.assertRaisesRegex( GrammarError, "You can only use newlines in regular expressions with the `x` \(verbose\) flag", parser, g) + + def test_import_custom_sources1(self): + custom_loader = FromPackageLoader(__name__, ('grammars', )) + grammar = """ + start: startab -class TestGrammar(TestCase): - def setUp(self): - pass + %import ab.startab + """ - def test_errors(self): - for msg, examples in GRAMMAR_ERRORS: - for example in examples: - try: - p = Lark(example) - except GrammarError as e: - assert msg in str(e) - else: - assert False, "example did not raise an error" + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(grammar, import_paths=[custom_loader]) + self.assertEqual(p.parse('ab'), + Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])])) - def test_empty_literal(self): - # Issues #888 - self.assertRaises(GrammarError, Lark, "start: \"\"") + def test_import_custom_sources2(self): + custom_loader = FromPackageLoader(__name__, ('grammars', )) - def test_ignore_name(self): - spaces = [] - p = Lark(""" - start: "a" "b" - WS: " " + grammar = """ + start: rule_to_import + + %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import + """ + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(grammar, import_paths=[custom_loader]) + x = p.parse('N') + 
self.assertEqual(next(x.find_data('rule_to_import')).children, ['N']) + + def test_import_custom_sources3(self): + custom_loader2 = FromPackageLoader(__name__) + grammar = """ + %import .test_relative_import (start, WS) %ignore WS - """, parser='lalr', lexer_callbacks={'WS': spaces.append}) - assert p.parse("a b") == p.parse("a b") - assert len(spaces) == 5 + """ + if parser == LarkDotLark: + self.skipTest("Test fails for lark.lark because it doesn't execute %import.") + p = parser(grammar, import_paths=[custom_loader2], source_path=__file__) # import relative to current file + x = p.parse('12 capybaras') + self.assertEqual(x.children, ['12', 'capybaras']) + + def test_find_grammar_errors1(self): + text = """ + a: rule + b rule + c: rule + B.: "hello" f + D: "okay" + """ + if parser == LarkDotLark: + self.skipTest("Test forces use of Lark.") + + assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5] + + def test_find_grammar_errors2(self): + text = """ + a: rule + b rule + | ok + c: rule + B.: "hello" f + D: "okay" + """ + if parser == LarkDotLark: + self.skipTest("Test forces use of Lark.") + assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6] + + def test_find_grammar_errors3(self): + text = """ + a: rule @#$#@$@&& + b: rule + | ok + c: rule + B: "hello" f @ + D: "okay" + """ - def test_override_rule(self): - # Overrides the 'sep' template in existing grammar to add an optional terminating delimiter - # Thus extending it beyond its original capacity - p = Lark(""" + if parser == LarkDotLark: + self.skipTest("Test forces use of Lark.") + x = find_grammar_errors(text) + assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] + + def test_ranged_repeat_terms1(self): + g = u"""!start: AAA + AAA: "A"~3 + """ + l = parser(g, parser='lalr') + self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + 
+ def test_ranged_repeat_terms2(self): + g = u"""!start: AABB CC + AABB: "A"~0..2 "B"~2 + CC: "C"~1..2 + """ + l = parser(g, parser='lalr') + self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) + self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) + self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + + def test_ranged_repeat_large1(self): + g = u"""!start: "A"~60 + """ + l = parser(g, parser='lalr') + if parser == LarkDotLark: + self.skipTest("Test depends on Lark.") + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) + + def test_ranged_repeat_large2(self): + g = u"""!start: "A"~15..100 + """ + l = parser(g, parser='lalr') + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) + + def test_ranged_repeat_large3(self): + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = parser(g, parser='lalr') + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) + + def test_large_terminal(self): + g = "start: NUMBERS\n" + g += "NUMBERS: " + '|'.join('"%s"' % i for i in range(0, 1000)) + + l = parser(g, parser='lalr') + for i in (0, 9, 99, 999): + with self.subTest(i=i): + self.assertEqual(l.parse(str(i)), Tree('start', [str(i)])) + for i in (-1, 1000): + with self.subTest(i=i): + 
self.assertRaises(UnexpectedInput, l.parse, str(i)) + + def test_list_grammar_imports(self): + grammar = """ %import .test_templates_import (start, sep) %override sep{item, delim}: item (delim item)* delim? %ignore " " - """, source_path=__file__) - - a = p.parse('[1, 2, 3]') - b = p.parse('[1, 2, 3, ]') - assert a == b - - self.assertRaises(GrammarError, Lark, """ - %import .test_templates_import (start, sep) - - %override sep{item}: item (delim item)* delim? - """, source_path=__file__) - - self.assertRaises(GrammarError, Lark, """ - %override sep{item}: item (delim item)* delim? - """, source_path=__file__) - - def test_override_terminal(self): - p = Lark(""" - - %import .grammars.ab (startab, A, B) - - %override A: "c" - %override B: "d" - """, start='startab', source_path=__file__) - - a = p.parse('cd') - self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')]) - - def test_extend_rule(self): - p = Lark(""" - %import .grammars.ab (startab, A, B, expr) - - %extend expr: B A - """, start='startab', source_path=__file__) - a = p.parse('abab') - self.assertEqual(a.children[0].children, ['a', Tree('expr', ['b', 'a']), 'b']) - - self.assertRaises(GrammarError, Lark, """ - %extend expr: B A - """) - - def test_extend_term(self): - p = Lark(""" - %import .grammars.ab (startab, A, B, expr) - - %extend A: "c" - """, start='startab', source_path=__file__) - a = p.parse('acbb') - self.assertEqual(a.children[0].children, ['a', Tree('expr', ['c', 'b']), 'b']) - - def test_extend_twice(self): - p = Lark(""" - start: x+ - - x: "a" - %extend x: "b" - %extend x: "c" - """) + """ - assert p.parse("abccbba") == p.parse("cbabbbb") + if parser == LarkDotLark: + self.skipTest("test_list_grammar_imports forces use of Lark.") + imports = list_grammar_imports(grammar, [os.path.dirname(__file__)]) + self.assertEqual({os.path.split(i)[-1] for i in imports}, {'test_templates_import.lark', 'templates.lark'}) - def test_undefined_ignore(self): - g = """!start: "A" + 
imports = list_grammar_imports('%import common.WS', []) + assert len(imports) == 1 and imports[0].pkg_name == 'lark' - %ignore B + def test_inline_with_expand_single(self): + grammar = r""" + start: _a + !?_a: "A" """ - self.assertRaises( GrammarError, Lark, g) - - g = """!start: "A" - - %ignore start + self.assertRaisesRegex(GrammarError, "Inlined rules \(_rule\) cannot use the \?rule modifier", parser, grammar) + + def test_line_breaks(self): + p = parser(r"""start: "a" \ + "b" + """) + p.parse('ab') + + def test_declare_rule(self): + g = """ + %declare a + start: b + b: "c" """ - self.assertRaises( GrammarError, Lark, g) + self.assertRaisesRegex(GrammarError, "Expecting terminal name", parser, g) - def test_alias_in_terminal(self): - g = """start: TERM - TERM: "a" -> alias + def test_declare_token(self): + g = """ + %declare A + start: b + b: "c" """ - self.assertRaises( GrammarError, Lark, g) - - def test_undefined_rule(self): - self.assertRaises(GrammarError, Lark, """start: a""") - - def test_undefined_term(self): - self.assertRaises(GrammarError, Lark, """start: A""") - - def test_token_multiline_only_works_with_x_flag(self): - g = r"""start: ABC - ABC: / a b c - d - e f - /i - """ - self.assertRaises( GrammarError, Lark, g) - - def test_import_custom_sources(self): - custom_loader = FromPackageLoader(__name__, ('grammars', )) - - grammar = """ - start: startab - - %import ab.startab - """ - - p = Lark(grammar, import_paths=[custom_loader]) - self.assertEqual(p.parse('ab'), - Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])])) - - def test_import_custom_sources2(self): - custom_loader = FromPackageLoader(__name__, ('grammars', )) - - grammar = """ - start: rule_to_import - - %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import - """ - p = Lark(grammar, import_paths=[custom_loader]) - x = p.parse('N') - self.assertEqual(next(x.find_data('rule_to_import')).children, ['N']) - - def 
test_import_custom_sources3(self): - custom_loader2 = FromPackageLoader(__name__) - grammar = """ - %import .test_relative_import (start, WS) - %ignore WS - """ - p = Lark(grammar, import_paths=[custom_loader2], source_path=__file__) # import relative to current file - x = p.parse('12 capybaras') - self.assertEqual(x.children, ['12', 'capybaras']) - - def test_find_grammar_errors(self): - text = """ - a: rule - b rule - c: rule - B.: "hello" f - D: "okay" - """ - - assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5] - - text = """ - a: rule - b rule - | ok - c: rule - B.: "hello" f - D: "okay" - """ - - assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6] - - text = """ - a: rule @#$#@$@&& - b: rule - | ok - c: rule - B: "hello" f @ - D: "okay" - """ - - x = find_grammar_errors(text) - assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] - - def test_ranged_repeat_terms(self): - g = u"""!start: AAA - AAA: "A"~3 + parser(g) + + def test_import_multiple(self): + g = """ + %ignore A B + start: rule1 + rule1: "c" + A: "a" + B: "b" """ - l = Lark(g, parser='lalr') - self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') - - g = u"""!start: AABB CC - AABB: "A"~0..2 "B"~2 - CC: "C"~1..2 + self.assertRaisesRegex(GrammarError, "Bad %ignore - must have a Terminal or other value", parser, g) + + def test_no_rule_aliases_below_top_level(self): + g = """start: rule + rule: ("a" -> alias + | "b") + """ + self.assertRaisesRegex( GrammarError, "Rule 'alias' used but not defined", parser, g) + + def test_no_term_templates(self): + g = """start: TERM + separated{x, sep}: x (sep x)* + TERM: separated{"A", " "} """ - l = Lark(g, parser='lalr') - self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) - self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) - 
self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - - def test_ranged_repeat_large(self): - g = u"""!start: "A"~60 + self.assertRaisesRegex( GrammarError, "Templates not allowed in terminals", parser, g) + + def test_term_no_call_rule(self): + g = """start: TERM + TERM: rule + rule: "a" """ - l = Lark(g, parser='lalr') - self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) - self.assertRaises(ParseError, l.parse, u'A' * 59) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) + self.assertRaisesRegex( GrammarError, "Rules aren't allowed inside terminals", parser, g) - g = u"""!start: "A"~15..100 + def test_no_rule_modifiers_in_references(self): + g = """start: rule1 + rule1: !?rule2 + rule2: "a" """ - l = Lark(g, parser='lalr') - for i in range(0, 110): - if 15 <= i <= 100: - self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) - else: - self.assertRaises(UnexpectedInput, l.parse, u'A' * i) - - # 8191 is a Mersenne prime - g = u"""start: "A"~8191 + self.assertRaisesRegex(GrammarError, "Expecting a value", Lark, g) + + def test_rule_modifier_query_bang(self): + g = """start: rule1 + rule1: rule2 + ?!rule2: "a" """ - l = Lark(g, parser='lalr') - self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) - - def test_large_terminal(self): - g = "start: NUMBERS\n" - g += "NUMBERS: " + '|'.join('"%s"' % i for i in range(0, 1000)) - - l = Lark(g, parser='lalr') - for i in (0, 9, 99, 999): - self.assertEqual(l.parse(str(i)), Tree('start', 
[str(i)])) - for i in (-1, 1000): - self.assertRaises(UnexpectedInput, l.parse, str(i)) - - def test_list_grammar_imports(self): - grammar = """ - %import .test_templates_import (start, sep) + parser(g) - %override sep{item, delim}: item (delim item)* delim? - %ignore " " + def test_alias_top_level_ok(self): + g = """ + start: rule1 + rule1: rule2 -> alias2 + rule2: "a" """ + parser(g) - imports = list_grammar_imports(grammar, [os.path.dirname(__file__)]) - self.assertEqual({os.path.split(i)[-1] for i in imports}, {'test_templates_import.lark', 'templates.lark'}) + def test_terminal_alias_bad(self): + g = """ + start: rule1 + rule1: TOKEN2 + TOKEN2: "a" -> alias2 + """ + self.assertRaisesRegex(GrammarError, "Aliasing not allowed in terminals", parser, g) - imports = list_grammar_imports('%import common.WS', []) - assert len(imports) == 1 and imports[0].pkg_name == 'lark' + _NAME = "TestGrammar" + parser.__name__ + _TestGrammar.__name__ = _NAME + _TestGrammar.__qualname__ = _NAME + globals()[_NAME] = _TestGrammar + __all__.append(_NAME) - def test_inline_with_expand_single(self): - grammar = r""" - start: _a - !?_a: "A" - """ - self.assertRaises(GrammarError, Lark, grammar) - - - def test_line_breaks(self): - p = Lark(r"""start: "a" \ - "b" - """) - p.parse('ab') - - def test_declare_rule_lg(self): - g = """ - %declare a - start: b - b: "c" - """ - self.assertRaisesRegex(GrammarError, "Expecting terminal name", Lark, g) - - def test_declare_rule_ll(self): - g = """ - %declare a - start: b - b: "c" - """ - l = Lark.open_from_package("lark", "grammars/lark.lark") - t = l.parse(g) - self.assertRaisesRegex(GrammarError, "Expecting terminal name", LarkValidatorVisitor.validate, t) - - def test_declare_token_lg(self): - g = """ - %declare A - start: b - b: "c" - """ - Lark(g) - - def test_declare_token_ll(self): - g = """ - %declare A - start: b - b: "c" - """ - l = Lark.open_from_package("lark", "grammars/lark.lark") - t = l.parse(g) - LarkValidatorVisitor.validate(t) 
+for parser in [Lark, LarkDotLark]: + _make_tests(parser) if __name__ == '__main__': main() diff --git a/tests/test_grammar_formal.py b/tests/test_grammar_formal.py deleted file mode 100644 index d657a2364..000000000 --- a/tests/test_grammar_formal.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import absolute_import - -from unittest import TestCase, main - -from lark import lark, Lark, UnexpectedToken -from lark.load_grammar import GrammarError - - -# Based on TestGrammar, with lots of tests that can't be run deleted. -class TestGrammarFormal(TestCase): - def setUp(self): - self.lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") - - def test_errors(self): - # This is an unrolled form of the test_grammar.py:GRAMMAR_ERRORS tests, because the lark.lark messages vary. - - # 'Incorrect type of value', 'a: 1\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..NUMBER., .1..', self.lark_parser.parse, 'a: 1\n') - # 'Unclosed parenthesis', 'a: (\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'a: (\n') - # 'Unmatched closing parenthesis', 'a: )\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.', self.lark_parser.parse, 'a: )\n') - # 'Unmatched closing parenthesis', 'a: )\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RPAR.,', self.lark_parser.parse, 'a: )\n') - # 'Unmatched closing parenthesis', 'a: (\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'a: (\n') - # 'Expecting rule or terminal definition (missing colon)', 'a\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'a\n') - # 'Expecting rule or terminal definition (missing colon)', 'A\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token.._NL.,', self.lark_parser.parse, 'A\n') - # 'Expecting rule or terminal definition (missing colon)', 'a->\n' - 
self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', self.lark_parser.parse, 'a->\n') - # 'Expecting rule or terminal definition (missing colon)', 'A->\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_0., .->', self.lark_parser.parse, 'A->\n') - # 'Expecting rule or terminal definition (missing colon)', 'a A\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .A..', self.lark_parser.parse, 'a A\n') - # 'Illegal name for rules or terminals', 'Aa:\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..RULE., .a..', self.lark_parser.parse, 'Aa:\n') - # 'Alias expects lowercase name', 'a: -> "a"\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', self.lark_parser.parse, 'a: -> "a"\n') - # 'Unexpected colon', 'a::\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a::\n') - # 'Unexpected colon', 'a: b:\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a: b:\n') - # 'Unexpected colon', 'a: B:\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a: B:\n') - # 'Unexpected colon', 'a: "a":\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..COLON.,', self.lark_parser.parse, 'a: "a":\n') - # 'Misplaced operator', 'a: b??' 
- self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', self.lark_parser.parse, 'a: b??') - # 'Misplaced operator', 'a: b(?)' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', self.lark_parser.parse, 'a: b(?)') - # 'Misplaced operator', 'a:+\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\+..', self.lark_parser.parse, 'a:+\n') - # 'Misplaced operator', 'a:?\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\?..', self.lark_parser.parse, 'a:?\n') - # 'Misplaced operator', 'a:*\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', self.lark_parser.parse, 'a:*\n') - # 'Misplaced operator', 'a:|*\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..OP., .\*..', self.lark_parser.parse, 'a:|*\n') - # 'Expecting option ("|") or a new rule or terminal definition', 'a:a\n()\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..LPAR.,', self.lark_parser.parse, 'a:a\n()\n') - # 'Terminal names cannot contain dots', 'A.B\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..TOKEN., .B..', self.lark_parser.parse, 'A.B\n') - # 'Expecting rule or terminal definition', '"a"\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', self.lark_parser.parse, '"a"\n') - # '%import expects a name', '%import "a"\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..STRING., ."a"..', self.lark_parser.parse, '%import "a"\n') - # '%ignore expects a value', '%ignore %import\n' - self.assertRaisesRegex(UnexpectedToken, 'Unexpected token Token..__ANON_2., .%import..', self.lark_parser.parse, '%ignore %import\n') - - def test_alias_in_terminal(self): - g = """start: TERM - TERM: "a" -> alias - """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'__ANON_0', '->'.", self.lark_parser.parse, g) - - def test_inline_with_expand_single(self): - grammar = 
r""" - start: _a - !?_a: "A" - """ - self.assertRaisesRegex(UnexpectedToken, "Unexpected token Token.'OP', '?'.", self.lark_parser.parse, grammar) - - -if __name__ == '__main__': - main() diff --git a/tests/test_ignore.py b/tests/test_ignore.py deleted file mode 100644 index 94712f1c7..000000000 --- a/tests/test_ignore.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import absolute_import - -import os -from unittest import TestCase, main - -from lark import lark, Lark, UnexpectedToken, UnexpectedCharacters -from lark.load_grammar import GrammarError -from lark.lark_validator_visitor import LarkValidatorVisitor - - -# Test that certain previous differences between load_grammar.py and -# grammars/lark.lark have been resolved. -class TestIgnore(TestCase): - def setUp(self): - lark_path = os.path.join(os.path.dirname(lark.__file__), 'grammars/lark.lark') - self.lark_parser = Lark.open(lark_path, parser="lalr") - - def test_load_grammar_import_multiple(self): - g = """ - %ignore A B - start: rule1 - rule1: "c" - A: "a" - B: "b" - """ - l = Lark(g) - self.assertRaisesRegex(UnexpectedCharacters, "No terminal matches 'b' in the current parser context", l.parse, "badbadbad") - - def test_lark_lark_ignore_multiple(self): - g = """ - %ignore A B - start: rule1 - rule1: "c" - A: "a" - B: "b" - """ - t = self.lark_parser.parse(g) - self.assertRaisesRegex(GrammarError, "Bad %ignore - must have a Terminal or other value", LarkValidatorVisitor.validate, t) - - -if __name__ == '__main__': - main() diff --git a/tests/test_lark_lark.py b/tests/test_lark_lark.py deleted file mode 100644 index b7a990e7c..000000000 --- a/tests/test_lark_lark.py +++ /dev/null @@ -1,209 +0,0 @@ -from __future__ import absolute_import - -from unittest import TestCase, main - -from lark import lark, Lark, UnexpectedToken -from lark.load_grammar import GrammarError -from lark.lark_validator_visitor import LarkValidatorVisitor - - -# Test that certain previous differences between load_grammar.py and -# 
grammars/lark.lark have been resolved. -class TestLarkLark(TestCase): - def setUp(self): - self.lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") - - def test_01_no_alias_in_terminal_lg(self): - g = """start: TERM - TERM: "a" -> alias - """ - self.assertRaisesRegex( GrammarError, "Aliasing not allowed in terminals", Lark, g) - - def test_01_no_alias_in_terminal_ll(self): - # lark.lark allows aliases in terminals, and rejects them if you run the LarkValidatorVisitor. - g = """start: TERM - TERM: "a" -> alias - """ - self.lark_parser.parse(g) - - def test_02_no_rule_aliases_below_top_level_lg(self): - g = """start: rule - rule: ("a" -> alias - | "b") - """ - self.assertRaisesRegex( GrammarError, "Rule 'alias' used but not defined", Lark, g) - - def test_02_no_rule_aliases_below_top_level_ll(self): - # lark.lark allows aliases below top-level, and rejects them if you run the LarkValidatorVisitor. - g = """start: rule - rule: ("a" -> alias - | "b") - """ - self.lark_parser.parse(g) - - def test_04_extend_rule_lg(self): - g = """ - %import .grammars.ab (startab, A, B, expr) - - %extend expr: B A - """ - Lark(g, start='startab', source_path=__file__) - - def test_04_extend_rule_ll(self): - g = """ - %import .grammars.ab (startab, A, B, expr) - - %extend expr: B A - """ - self.lark_parser.parse(g) - - def test_05_extend_term_lg(self): - g = """ - %import .grammars.ab (startab, A, B, expr) - - %extend A: "c" - """ - Lark(g, start='startab', source_path=__file__) - - def test_05_extend_term_ll(self): - g = """ - %import .grammars.ab (startab, A, B, expr) - - %extend A: "c" - """ - self.lark_parser.parse(g) - - def test_06_no_term_templates_lg(self): - g = """start: TERM - separated{x, sep}: x (sep x)* - TERM: separated{"A", " "} - """ - self.assertRaisesRegex( GrammarError, "Templates not allowed in terminals", Lark, g) - - def test_06_no_term_templates_ll(self): - # lark.lark allows templates in terminals, and rejects them if you run the 
LarkValidatorVisitor. - g = """start: TERM - separated{x, sep}: x (sep x)* - TERM: separated{"A", " "} - """ - self.lark_parser.parse(g) - - def test_07_term_no_call_rule_lg(self): - g = """start: TERM - TERM: rule - rule: "a" - """ - self.assertRaisesRegex( GrammarError, "Rules aren't allowed inside terminals", Lark, g) - - def test_07_term_no_call_rule_ll(self): - # lark.lark allows rules in terminals, and rejects them if you run the LarkValidatorVisitor. - g = """start: TERM - TERM: rule - rule: "a" - """ - self.lark_parser.parse(g) - - def test_08_override_term_lg(self): - g = """ - %import .grammars.ab (startab, A, B, expr) - - %override A: "c" - """ - Lark(g, start='startab', source_path=__file__) - - def test_08_override_term_ll(self): - g = """ - %import .grammars.ab (startab, A, B, expr) - - %override A: "c" - """ - self.lark_parser.parse(g) - - def test_09_no_rule_modifiers_in_references_lg(self): - g = """start: rule1 - rule1: !?rule2 - rule2: "a" - """ - self.assertRaisesRegex(GrammarError, "Expecting a value, at line 2 column 20", Lark, g) - - def test_09_no_rule_modifiers_in_references_ll(self): - g = """start: rule1 - rule1: !rule2 - rule2: "a" - """ - self.assertRaisesRegex( UnexpectedToken, "Unexpected token Token.'RULE_MODIFIERS', '!'.", self.lark_parser.parse, g) - - def test_10_rule_modifier_query_bang_lg(self): - g = """start: rule1 - rule1: rule2 - ?!rule2: "a" - """ - Lark(g) - - def test_10_rule_modifier_query_bang_ll(self): - g = """start: rule1 - rule1: rule2 - ?!rule2: "a" - """ - self.lark_parser.parse(g) - - def test_lark_validator_alias_top_level_ok(self): - g = """ - start: rule1 - rule1: rule2 -> alias2 - """ - t = self.lark_parser.parse(g) - LarkValidatorVisitor.validate(t) - - def test_lark_validator_alias_inner_bad(self): - g = """ - start: rule1 - rule1: rule2 - | (rule3 -> alias3 | rule4) - rule2: "a" - rule3: "b" - rule4: "c" - """ - - t = self.lark_parser.parse(g) - self.assertRaisesRegex( Exception, "Deep aliasing not 
allowed", LarkValidatorVisitor.validate, t) - - def test_lark_validator_import_multi_token_bad(self): - g = """ - %ignore A B - start: rule1 - rule1: "c" - A: "a" - B: "b" - """ - t = self.lark_parser.parse(g) - self.assertRaisesRegex(GrammarError, "Bad %ignore - must have a Terminal or other value", LarkValidatorVisitor.validate, t) - - def test_lark_validator_terminal_alias_bad(self): - g = """ - start: rule1 - rule1: TOKEN2 - TOKEN2: "a" -> alias2 - """ - t = self.lark_parser.parse(g) - self.assertRaisesRegex(GrammarError, "Aliasing not allowed in terminals", LarkValidatorVisitor.validate, t) - - def test_lark_validator_terminal_rule_bad(self): - g = """start: TERM - TERM: rule - rule: "a" - """ - t = self.lark_parser.parse(g) - self.assertRaisesRegex(GrammarError, "Rules aren't allowed inside terminals", LarkValidatorVisitor.validate, t) - - def test_lark_validator_terminal_template_bad(self): - g = """start: TERM - separated{x, sep}: x (sep x)* - TERM: separated{"A", " "} - """ - t = self.lark_parser.parse(g) - self.assertRaisesRegex(GrammarError, "Templates not allowed in terminals", LarkValidatorVisitor.validate, t) - - -if __name__ == '__main__': - main() diff --git a/tests/test_lark_validator.py b/tests/test_lark_validator.py new file mode 100644 index 000000000..7b929bcab --- /dev/null +++ b/tests/test_lark_validator.py @@ -0,0 +1,12 @@ +from lark import Lark +from lark.lark_validator import LarkValidator +from unittest import TestCase, main, SkipTest + +class TestLarkValidator(TestCase): + def test_example(self): + my_grammar = """ + start: "A" + """ + lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") + parse_tree = lark_parser.parse(my_grammar) + LarkValidator.validate(parse_tree) \ No newline at end of file From 5f37365b542402ab66d2616d929601be34eadcd4 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 20 Jun 2024 21:28:19 -0400 Subject: [PATCH 16/21] Resolve @megalng comment re:@skipIf --- tests/test_grammar.py | 
44 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 8a4b75c88..6b69b9c26 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -2,7 +2,7 @@ import re import os -from unittest import TestCase, main, SkipTest +from unittest import TestCase, main, skipIf, SkipTest from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors, list_grammar_imports @@ -30,10 +30,8 @@ def parse(self, text: str, start=None, on_error=None): def _make_tests(parser): class _TestGrammar(TestCase): + @skipIf(parser == LarkDotLark, 'test_errors needs rewriting to work with lark.lark') def test_errors(self): - # TODO test_errors needs a lot of work for lark.lark. - if parser == LarkDotLark: - self.skipTest("test_errors needs a lot of work for lark.lark.") for msg, examples in GRAMMAR_ERRORS: for example in examples: with self.subTest(example=example): @@ -53,10 +51,10 @@ def test_ignore_name(self): assert p.parse("a b") == p.parse("a b") assert len(spaces) == 5 + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_override_rule1(self): # Overrides the 'sep' template in existing grammar to add an optional terminating delimiter # Thus extending it beyond its original capacity - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(""" %import .test_templates_import (start, sep) @@ -68,9 +66,8 @@ def test_override_rule1(self): b = p.parse('[1, 2, 3, ]') assert a == b + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_override_rule2(self): - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") self.assertRaisesRegex(GrammarError, "Rule 'delim' used but not defined \(in rule sep\)", parser, """ %import 
.test_templates_import (start, sep) @@ -82,9 +79,8 @@ def test_override_rule3(self): %override sep{item}: item (delim item)* delim? """, source_path=__file__) + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_override_terminal(self): - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(""" %import .grammars.ab (startab, A, B) @@ -96,9 +92,8 @@ def test_override_terminal(self): a = p.parse('cd') self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')]) + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_extend_rule1(self): - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(""" %import .grammars.ab (startab, A, B, expr) @@ -112,9 +107,8 @@ def test_extend_rule2(self): %extend expr: B A """) + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_extend_term(self): - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(""" %import .grammars.ab (startab, A, B, expr) @@ -169,6 +163,7 @@ def test_token_multiline_only_works_with_x_flag(self): """ self.assertRaisesRegex( GrammarError, "You can only use newlines in regular expressions with the `x` \(verbose\) flag", parser, g) + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_import_custom_sources1(self): custom_loader = FromPackageLoader(__name__, ('grammars', )) @@ -178,12 +173,11 @@ def test_import_custom_sources1(self): %import ab.startab """ - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(grammar, import_paths=[custom_loader]) self.assertEqual(p.parse('ab'), Tree('start', [Tree('startab', [Tree('ab__expr', 
[Token('ab__A', 'a'), Token('ab__B', 'b')])])])) + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_import_custom_sources2(self): custom_loader = FromPackageLoader(__name__, ('grammars', )) @@ -192,24 +186,22 @@ def test_import_custom_sources2(self): %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import """ - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(grammar, import_paths=[custom_loader]) x = p.parse('N') self.assertEqual(next(x.find_data('rule_to_import')).children, ['N']) + @skipIf(parser == LarkDotLark, 'Test fails for lark.lark because it does not execute %import.') def test_import_custom_sources3(self): custom_loader2 = FromPackageLoader(__name__) grammar = """ %import .test_relative_import (start, WS) %ignore WS """ - if parser == LarkDotLark: - self.skipTest("Test fails for lark.lark because it doesn't execute %import.") p = parser(grammar, import_paths=[custom_loader2], source_path=__file__) # import relative to current file x = p.parse('12 capybaras') self.assertEqual(x.children, ['12', 'capybaras']) + @skipIf(parser == LarkDotLark, 'Test forces use of Lark.') def test_find_grammar_errors1(self): text = """ a: rule @@ -218,11 +210,10 @@ def test_find_grammar_errors1(self): B.: "hello" f D: "okay" """ - if parser == LarkDotLark: - self.skipTest("Test forces use of Lark.") assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5] + @skipIf(parser == LarkDotLark, 'Test forces use of Lark.') def test_find_grammar_errors2(self): text = """ a: rule @@ -233,10 +224,9 @@ def test_find_grammar_errors2(self): D: "okay" """ - if parser == LarkDotLark: - self.skipTest("Test forces use of Lark.") assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6] + @skipIf(parser == LarkDotLark, 'Test forces use of Lark.') def test_find_grammar_errors3(self): text = """ a: rule @#$#@$@&& @@ -247,8 +237,6 @@ 
def test_find_grammar_errors3(self): D: "okay" """ - if parser == LarkDotLark: - self.skipTest("Test forces use of Lark.") x = find_grammar_errors(text) assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] @@ -275,12 +263,11 @@ def test_ranged_repeat_terms2(self): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + @skipIf(parser == LarkDotLark, 'Test depends on Lark.') def test_ranged_repeat_large1(self): g = u"""!start: "A"~60 """ l = parser(g, parser='lalr') - if parser == LarkDotLark: - self.skipTest("Test depends on Lark.") self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) self.assertRaises(ParseError, l.parse, u'A' * 59) @@ -317,6 +304,7 @@ def test_large_terminal(self): with self.subTest(i=i): self.assertRaises(UnexpectedInput, l.parse, str(i)) + @skipIf(parser == LarkDotLark, 'Test forces use of Lark.') def test_list_grammar_imports(self): grammar = """ %import .test_templates_import (start, sep) @@ -325,8 +313,6 @@ def test_list_grammar_imports(self): %ignore " " """ - if parser == LarkDotLark: - self.skipTest("test_list_grammar_imports forces use of Lark.") imports = list_grammar_imports(grammar, [os.path.dirname(__file__)]) self.assertEqual({os.path.split(i)[-1] for i in imports}, {'test_templates_import.lark', 'templates.lark'}) From 697841b7da28aac8ffc214cfb5c3b029f57fdfa4 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 20 Jun 2024 21:29:10 -0400 Subject: [PATCH 17/21] Resolve @megalng comment re:tests/test_lark_validator.py --- tests/test_lark_validator.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/test_lark_validator.py diff --git a/tests/test_lark_validator.py b/tests/test_lark_validator.py deleted file mode 100644 index 7b929bcab..000000000 --- a/tests/test_lark_validator.py +++ /dev/null @@ -1,12 +0,0 @@ -from 
lark import Lark -from lark.lark_validator import LarkValidator -from unittest import TestCase, main, SkipTest - -class TestLarkValidator(TestCase): - def test_example(self): - my_grammar = """ - start: "A" - """ - lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", parser="lalr") - parse_tree = lark_parser.parse(my_grammar) - LarkValidator.validate(parse_tree) \ No newline at end of file From 654e102300faeeba277841c3fe0af75ad66f41e5 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 20 Jun 2024 21:37:21 -0400 Subject: [PATCH 18/21] Resolve @megalng comment re:docstrings --- lark/lark_validator.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/lark/lark_validator.py b/lark/lark_validator.py index 76d9ea3d7..5bc771bdc 100644 --- a/lark/lark_validator.py +++ b/lark/lark_validator.py @@ -13,9 +13,36 @@ def __init__(self, is_term, tree, params=(), options=None): self.params = tuple(params) class LarkValidator: + """ + Checks a grammar parsed by `lark.lark` for validity using a variety of checks similar to what + `load_grammar.py` does on parser creation. The only stable public entry point is + `LarkValidator.validate(tree)`. + + Checks: + - Illegal constructs not prevented by the grammar: + - `alias` not in the top expansions of a rule + - Incorrect `%ignore` lines + - Invalid literals (like newlines inside of regex without the `x` flag) + - Rules used inside of Terminals + - Undefined symbols + - Incorrectly used templates + """ @classmethod def validate(cls, tree: Tree, options: Dict[str, Any] = {}): + """ + Checks a grammar parsed by `lark.lark` for validity using a variety of checks similar to what + `load_grammar.py` does on parser creation. 
+ + Checks: + - Illegal constructs not prevented by the grammar: + - `alias` not in the top expansions of a rule + - Incorrect `%ignore` lines + - Invalid literals (like newlines inside of regex without the `x` flag) + - Rules used inside of Terminals + - Undefined symbols + - Incorrectly used templates + """ visitor = cls(tree, options) visitor._cross_check_symbols() visitor._resolve_term_references() From 33d7088c59cb704b235b8427666ebf4296e52686 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 20 Jun 2024 21:58:26 -0400 Subject: [PATCH 19/21] Resolve @erezsh comment re:typo --- lark/grammars/lark.lark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/grammars/lark.lark b/lark/grammars/lark.lark index 6c7d0ed4d..0c072eed8 100644 --- a/lark/grammars/lark.lark +++ b/lark/grammars/lark.lark @@ -1,6 +1,6 @@ # Lark grammar of Lark's syntax # Note: Lark is not bootstrapped, its parser is implemented in load_grammar.py -# This grammar matches that one, but does not enfore some rules that it does. +# This grammar matches that one, but does not enforce some rules that it does. # If you want to enforce those, you can pass the "LarkValidator" over # the parse tree, like this: From 0d01fe224fa05fd3575064247c3e8d749ea2e84f Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Thu, 20 Jun 2024 22:02:12 -0400 Subject: [PATCH 20/21] Resolve part of @erezsh comment re: options. --- tests/test_grammar.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 6b69b9c26..20c1ded85 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -19,7 +19,6 @@ def __init__(self, grammar, **kwargs): if "start" in options and options["start"] != "start": # We're not going to parse with the parser, so just override it. 
options["start"] = "start" - options["propagate_positions"] = True lark_parser = Lark.open_from_package("lark", "grammars/lark.lark", **options) tree = lark_parser.parse(grammar) LarkValidator.validate(tree) From 20302ca65239bf952c3a10e4acd26b2f625b2a43 Mon Sep 17 00:00:00 2001 From: Ross Patterson Date: Tue, 24 Sep 2024 10:37:10 -0400 Subject: [PATCH 21/21] Remove obsolete 'options' parameter --- lark/lark_validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/lark_validator.py b/lark/lark_validator.py index 5bc771bdc..530165a52 100644 --- a/lark/lark_validator.py +++ b/lark/lark_validator.py @@ -29,7 +29,7 @@ class LarkValidator: """ @classmethod - def validate(cls, tree: Tree, options: Dict[str, Any] = {}): + def validate(cls, tree: Tree): """ Checks a grammar parsed by `lark.lark` for validity using a variety of checks similar to what `load_grammar.py` does on parser creation. @@ -43,13 +43,13 @@ def validate(cls, tree: Tree, options: Dict[str, Any] = {}): - Undefined symbols - Incorrectly used templates """ - visitor = cls(tree, options) + visitor = cls(tree) visitor._cross_check_symbols() visitor._resolve_term_references() visitor._check_literals(tree) return tree - def __init__(self, tree: Tree, options: Dict[str, Any]): + def __init__(self, tree: Tree): self._definitions: Dict[str, Definition] = {} self._ignore_names: List[str] = [] self._load_grammar(tree)