Skip to content

Commit

Permalink
Reimplementation of end symbol (Issue #237)
Browse files Browse the repository at this point in the history
  • Loading branch information
erezsh committed Apr 18, 2021
1 parent 385c35f commit 51cde70
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 14 deletions.
1 change: 1 addition & 0 deletions lark/grammar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .utils import Serialize

###{standalone
END = '__$END$__'

class Symbol(Serialize):
__slots__ = ('name',)
Expand Down
10 changes: 9 additions & 1 deletion lark/load_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
from .utils import classify, suppress, dedup_list, Str
from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError

Expand Down Expand Up @@ -99,6 +99,7 @@
'_EXTEND': r'%extend',
'_IMPORT': r'%import',
'NUMBER': r'[+-]?\d+',
'_END': r'\$',
}

RULES = {
Expand Down Expand Up @@ -135,6 +136,7 @@
'nonterminal',
'literal',
'range',
'end',
'template_usage'],

'terminal': ['TERMINAL'],
Expand All @@ -144,6 +146,7 @@

'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOTDOT STRING'],
'end': ['_END'],

'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
'_template_args': ['value',
Expand Down Expand Up @@ -791,6 +794,9 @@ def terminal(self, name):
def nonterminal(self, name):
    """Return the nonterminal's name unchanged."""
    return name

def end(self):
    """Return a ``TERMINAL`` token whose value is the ``END`` symbol name."""
    return Token('TERMINAL', END)


def _find_used_symbols(tree):
assert tree.data == 'expansions'
Expand Down Expand Up @@ -938,6 +944,8 @@ def __init__(self, global_keep_all_tokens=False, import_paths=None):
self._definitions = {}
self._ignore_names = []

self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))

def _is_term(self, name):
# Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
# Only the last part is the actual name, and the rest might contain mixed case
Expand Down
4 changes: 2 additions & 2 deletions lark/parsers/grammar_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from ..utils import bfs, fzset, classify
from ..exceptions import GrammarError
from ..grammar import Rule, Terminal, NonTerminal
from ..grammar import Rule, Terminal, NonTerminal, END


class RulePtr(object):
Expand Down Expand Up @@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
def __init__(self, parser_conf, debug=False):
self.debug = debug

root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
for start in parser_conf.start}

rules = parser_conf.rules + list(root_rules.values())
Expand Down
4 changes: 2 additions & 2 deletions lark/parsers/lalr_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
from ..grammar import Rule
from ..grammar import Rule, END

###{standalone

Expand Down Expand Up @@ -177,7 +177,7 @@ def compute_reads_relations(self):
assert(len(root.kernel) == 1)
for rp in root.kernel:
assert(rp.index == 0)
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

for state in self.lr0_states:
seen = set()
Expand Down
11 changes: 6 additions & 5 deletions lark/parsers/lalr_interactive_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from .. import Token
from ..exceptions import UnexpectedToken
from ..grammar import END


class InteractiveParser(object):
Expand All @@ -21,18 +22,18 @@ def feed_token(self, token):
Note that ``token`` has to be an instance of ``Token``.
"""
return self.parser_state.feed_token(token, token.type == '$END')
return self.parser_state.feed_token(token, token.type == END)

def exhaust_lexer(self):
"""Try to feed the rest of the lexer state into the interactive parser.
Note that this modifies the instance in place and does not feed an '$END' Token"""
Note that this modifies the instance in place and does not feed an END Token"""
for token in self.lexer_state.lex(self.parser_state):
self.parser_state.feed_token(token)

def feed_eof(self, last_token=None):
"""Feed a '$END' Token. Borrows from 'last_token' if given."""
eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
"""Feed a END Token. Borrows from 'last_token' if given."""
eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
return self.feed_token(eof)


Expand Down Expand Up @@ -116,7 +117,7 @@ def feed_token(self, token):
def exhaust_lexer(self):
"""Try to feed the rest of the lexer state into the parser.
Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
cursor = self.as_mutable()
cursor.exhaust_lexer()
return cursor.as_immutable()
Expand Down
12 changes: 8 additions & 4 deletions lark/parsers/lalr_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_interactive_parser import InteractiveParser
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
from ..grammar import END

###{standalone

Expand Down Expand Up @@ -60,7 +61,7 @@ def parse(self, lexer, start, on_error=None):
return e.interactive_parser.resume_parse()
except UnexpectedToken as e2:
if (isinstance(e, UnexpectedToken)
and e.token.type == e2.token.type == '$END'
and e.token.type == e2.token.type == END
and e.interactive_parser == e2.interactive_parser):
# Prevent infinite loop
raise e2
Expand Down Expand Up @@ -132,7 +133,7 @@ def feed_token(self, token, is_end=False):

if action is Shift:
# shift once and return
assert not is_end
# assert not is_end
state_stack.append(arg)
value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
return
Expand Down Expand Up @@ -178,8 +179,11 @@ def parse_from_state(self, state):
for token in state.lexer.lex(state):
state.feed_token(token)

token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
return state.feed_token(token, True)
token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
while True:
x = state.feed_token(token, True)
if x is not None:
return x
except UnexpectedInput as e:
try:
e.interactive_parser = InteractiveParser(self, state, state.lexer)
Expand Down
37 changes: 37 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2467,6 +2467,43 @@ def ignore_errors(e):
s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
tree = g.parse(s, on_error=ignore_errors)

@unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
def test_end_symbol(self):
    """Rule ``a`` ends with ``$``: 'a' parses, 'ab' raises UnexpectedInput."""
    # The grammar text is kept flush-left so the string content is unchanged.
    grammar = """
start: a b?
a: "a" $
b: "b"
"""
    parser = _Lark(grammar)

    self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
    self.assertRaises(UnexpectedInput, parser.parse, 'ab')

@unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
def test_end_symbol2(self):
    """``$`` as an alternative to 'x' inside a group: 'axa' parses, 'ab' raises."""
    # The grammar text is kept flush-left so the string content is unchanged.
    grammar = """
start: (a|b)+
a: "a" ("x"|$)
b: "b"
"""
    parser = _Lark(grammar)

    self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [])]))
    self.assertRaises(UnexpectedInput, parser.parse, 'ab')

@unittest.skipIf(PARSER != 'lalr', "Using the end symbol currently works for LALR only")
def test_end_symbol3(self):
    """``$`` wrapped in its own rule ``e``: 'axa' yields an ``e`` subtree, 'ab' raises."""
    # The grammar text is kept flush-left so the string content is unchanged.
    grammar = """
start: (a|b)+
a: "a" (e|"x")
b: "b"
e: $
"""
    parser = _Lark(grammar)

    self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [Tree('e', [])])]))
    self.assertRaises(UnexpectedInput, parser.parse, 'ab')


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
Expand Down

0 comments on commit 51cde70

Please sign in to comment.