diff --git a/docs/classes.md b/docs/classes.md
index 8b32801e..7bd92fe6 100644
--- a/docs/classes.md
+++ b/docs/classes.md
@@ -128,6 +128,7 @@ Useful for caching and multiprocessing.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback
+- **use_bytes** - Accept and parse an input of type `bytes` instead of `str`. Grammar should still be specified as `str`, and terminal values are assumed to be `latin-1`.
 
 #### Using Unicode character classes with `regex`
 
diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index d601fc23..c39ae3d2 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -31,10 +31,12 @@ class LarkOptions:
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache: Union[bool, str]
     g_regex_flags: int
+    use_bytes: bool
 
 
 class Lark:
     source: str
+    grammar_source: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]
@@ -56,7 +58,8 @@ class Lark:
         maybe_placeholders: bool = False,
         lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
         cache: Union[bool, str] = False,
-        g_regex_flags: int = ...
+        g_regex_flags: int = ...,
+        use_bytes: bool = False,
     ):
         ...
 
diff --git a/lark/common.py b/lark/common.py
index 5c55b8c2..cc8c73c2 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -4,10 +4,10 @@
 ###{standalone
 
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
     __serialize_namespace__ = TerminalDef,
 
-    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
+    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
         self.tokens = tokens  # TODO should be terminals
         self.ignore = ignore
         self.postlex = postlex
@@ -15,6 +15,7 @@ def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g
         self.g_regex_flags = g_regex_flags
         self.re_module = re_module
         self.skip_validation = skip_validation
+        self.use_bytes = use_bytes
 
     def _deserialize(self):
         self.callbacks = {}  # TODO
diff --git a/lark/exceptions.py b/lark/exceptions.py
index 1c5e533e..033275cc 100644
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -28,9 +28,14 @@ def get_context(self, text, span=40):
         pos = self.pos_in_stream
         start = max(pos - span, 0)
         end = pos + span
-        before = text[start:pos].rsplit('\n', 1)[-1]
-        after = text[pos:end].split('\n', 1)[0]
-        return before + after + '\n' + ' ' * len(before) + '^\n'
+        if not isinstance(text, bytes):
+            before = text[start:pos].rsplit('\n', 1)[-1]
+            after = text[pos:end].split('\n', 1)[0]
+            return before + after + '\n' + ' ' * len(before) + '^\n'
+        else:
+            before = text[start:pos].rsplit(b'\n', 1)[-1]
+            after = text[pos:end].split(b'\n', 1)[0]
+            return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")
 
     def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
         """ Given a parser instance and a dictionary mapping some label with
@@ -67,7 +72,11 @@ def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
 
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
-        message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
+
+        if isinstance(seq, bytes):
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace"), line, column)
+        else:
+            message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column)
 
         self.line = line
         self.column = column
diff --git a/lark/lark.py b/lark/lark.py
index 232dbb75..daab45b0 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -4,7 +4,7 @@
 
 from io import open
 
-from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
+from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -82,6 +82,7 @@ class LarkOptions(Serialize):
             invert (Default: auto)
     lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                       tokens during lexing. Use with caution.
+    use_bytes - Accept an input of type `bytes` instead of `str` (Python 3 only).
     edit_terminals - A callback
     """
     if __doc__:
@@ -105,6 +106,7 @@ class LarkOptions(Serialize):
         'maybe_placeholders': False,
         'edit_terminals': None,
         'g_regex_flags': 0,
+        'use_bytes': False,
     }
 
     def __init__(self, options_dict):
@@ -114,7 +116,7 @@ def __init__(self, options_dict):
         for name, default in self._defaults.items():
             if name in o:
                 value = o.pop(name)
-                if isinstance(default, bool) and name != 'cache':
+                if isinstance(default, bool) and name not in ('cache', 'use_bytes'):
                     value = bool(value)
             else:
                 value = default
@@ -187,6 +189,13 @@ def __init__(self, grammar, **options):
                 grammar = read()
 
         assert isinstance(grammar, STRING_TYPE)
+        self.grammar_source = grammar
+        if self.options.use_bytes:
+            if not isascii(grammar):
+                raise ValueError("Grammar must be ascii only when use_bytes=True")
+            if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
+                raise NotImplementedError("`use_bytes=True` may have issues on python2."
+                                          " Use `use_bytes='force'` to use it at your own risk.")
 
         cache_fn = None
         if self.options.cache:
@@ -196,7 +205,7 @@ def __init__(self, grammar, **options):
                 cache_fn = self.options.cache
             else:
                 if self.options.cache is not True:
-                    raise ValueError("cache must be bool or str")
+                    raise ValueError("cache argument must be bool or str")
                 unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                 from . import __version__
                 options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
@@ -252,7 +261,7 @@ def __init__(self, grammar, **options):
             for t in self.terminals:
                 self.options.edit_terminals(t)
 
-        self._terminals_dict = {t.name:t for t in self.terminals}
+        self._terminals_dict = {t.name: t for t in self.terminals}
 
         # If the user asked to invert the priorities, negate them all here.
         # This replaces the old 'resolve__antiscore_sum' option.
@@ -276,7 +285,7 @@ def __init__(self, grammar, **options):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)
 
-        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
+        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags, use_bytes=self.options.use_bytes)
 
         if self.options.parser:
             self.parser = self._build_parser()
diff --git a/lark/lexer.py b/lark/lexer.py
index 49795000..c77207bb 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -139,8 +139,8 @@ def __eq__(self, other):
 
 
 class LineCounter:
-    def __init__(self):
-        self.newline_char = '\n'
+    def __init__(self, newline_char):
+        self.newline_char = newline_char
         self.char_pos = 0
         self.line = 1
         self.column = 1
@@ -169,7 +169,7 @@ def __init__(self, lexer, state=None):
     def lex(self, stream, newline_types, ignore_types):
         newline_types = frozenset(newline_types)
         ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter()
+        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
         last_token = None
 
         while line_ctr.char_pos < len(stream):
@@ -230,7 +230,7 @@ def __call__(self, t):
 
 
 
-def _create_unless(terminals, g_regex_flags, re_):
+def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -247,31 +247,34 @@ def _create_unless(terminals, g_regex_flags, re_):
                     if strtok.pattern.flags <= retok.pattern.flags:
                         embedded_strs.add(strtok)
             if unless:
-                callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
+                callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
 
     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
 
 
-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
     postfix = '$' if match_whole else ''
     mres = []
     while terminals:
+        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+        if use_bytes:
+            pattern = pattern.encode('latin-1')
         try:
-            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(pattern, g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
 
 #        terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
 
-def build_mres(terminals, g_regex_flags, re_, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
+def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
 
 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -321,12 +324,13 @@ def __init__(self, conf):
         self.terminals = terminals
         self.user_callbacks = conf.callbacks
         self.g_regex_flags = conf.g_regex_flags
+        self.use_bytes = conf.use_bytes
 
         self._mres = None
         # self.build(g_regex_flags)
 
     def _build(self):
-        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
+        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
         assert all(self.callback.values())
 
         for type_, f in self.user_callbacks.items():
@@ -336,7 +340,7 @@ def _build(self):
             else:
                 self.callback[type_] = f
 
-        self._mres = build_mres(terminals, self.g_regex_flags, self.re)
+        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
 
     @property
     def mres(self):
@@ -365,7 +369,8 @@ def __init__(self, conf, states, always_accept=()):
                 assert t.name not in tokens_by_name, t
                 tokens_by_name[t.name] = t
 
-        trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)
+        trad_conf = copy(conf)
+        trad_conf.tokens = terminals
 
         lexer_by_tokens = {}
         self.lexers = {}
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index c05f2350..33ad9bc7 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -189,6 +189,8 @@ def _prepare_match(self, lexer_conf):
             else:
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
+                if lexer_conf.use_bytes:
+                    regexp = regexp.encode('utf-8')
 
                 self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
diff --git a/lark/utils.py b/lark/utils.py
index 36f50d1e..c70b947e 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -305,4 +305,17 @@ def combine_alternatives(lists):
 
 class FS:
     open = open
-    exists = os.path.exists
\ No newline at end of file
+    exists = os.path.exists
+
+
+
+def isascii(s):
+    """ str.isascii only exists in python3.7+ """
+    try:
+        return s.isascii()
+    except AttributeError:
+        try:
+            s.encode('ascii')
+            return True
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            return False
\ No newline at end of file
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1249211c..f1e269f5 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -8,7 +8,9 @@
 import sys
 from copy import copy, deepcopy
 
-from lark.utils import Py36
+from lark.utils import Py36, isascii
+
+from lark import Token
 
 try:
     from cStringIO import StringIO as cStringIO
@@ -561,12 +563,84 @@ def __init__(self, lexer_conf):
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
 
+def _tree_structure_check(a, b):
+    """
+    Checks that both Tree objects have the same structure, without checking their values.
+    """
+    assert a.data == b.data and len(a.children) == len(b.children)
+    for ca, cb in zip(a.children, b.children):
+        assert type(ca) == type(cb)
+        if isinstance(ca, Tree):
+            _tree_structure_check(ca, cb)
+        elif isinstance(ca, Token):
+            assert ca.type == cb.type
+        else:
+            assert ca == cb
+
+class DualBytesLark:
+    """
+    A helper class that wraps both a normal parser and a parser for bytes.
+    It automatically forwards `.parse` calls to both lexers, returning the value from the text lexer.
+    It always checks that both produce the same output/error.
+
+    NOTE: Not currently used, but left here for future debugging.
+    """
+
+    def __init__(self, g, *args, **kwargs):
+        self.text_lexer = Lark(g, *args, use_bytes=False, **kwargs)
+        g = self.text_lexer.grammar_source.lower()
+        if '\\u' in g or not isascii(g):
+            # Bytes re can't deal with unicode escapes
+            self.bytes_lark = None
+        else:
+            # Everything here should work, so use `use_bytes='force'`
+            self.bytes_lark = Lark(self.text_lexer.grammar_source, *args, use_bytes='force', **kwargs)
+
+    def parse(self, text, start=None):
+        # TODO: Easy workaround, more complex checks would be beneficial
+        if not isascii(text) or self.bytes_lark is None:
+            return self.text_lexer.parse(text, start)
+        try:
+            rv = self.text_lexer.parse(text, start)
+        except Exception as e:
+            try:
+                self.bytes_lark.parse(text.encode(), start)
+            except Exception as be:
+                assert type(e) == type(be), "Parser with and without `use_bytes` raise different exceptions"
+                raise e
+            assert False, "Parser without `use_bytes` raises an exception, but the parser with `use_bytes` doesn't"
+        try:
+            bv = self.bytes_lark.parse(text.encode(), start)
+        except Exception as be:
+            assert False, "Parser without `use_bytes` doesn't raise an exception, but the parser with `use_bytes` does"
+        _tree_structure_check(rv, bv)
+        return rv
+
+    @classmethod
+    def open(cls, grammar_filename, rel_to=None, **options):
+        if rel_to:
+            basepath = os.path.dirname(rel_to)
+            grammar_filename = os.path.join(basepath, grammar_filename)
+        with open(grammar_filename, encoding='utf8') as f:
+            return cls(f, **options)
+
+    def save(self, f):
+        self.text_lexer.save(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.save(f)
+
+    def load(self, f):
+        self.text_lexer = self.text_lexer.load(f)
+        if self.bytes_lark is not None:
+            self.bytes_lark.load(f)
+
 def _make_parser_test(LEXER, PARSER):
     lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
         return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
+
     class _TestParser(unittest.TestCase):
         def test_basic1(self):
             g = _Lark("""start: a+ b a* "b" a*
@@ -647,6 +721,28 @@ def test_hex_literal_range_escape(self):
                          """)
             g.parse('\x01\x02\x03')
 
+        @unittest.skipIf(sys.version_info[:2] == (2, 7), "bytes parser isn't perfect in Python 2.7; exceptions don't work correctly")
+        def test_bytes_utf8(self):
+            g = r"""
+            start: BOM? char+
+            BOM: "\xef\xbb\xbf"
+            char: CHAR1 | CHAR2 | CHAR3 | CHAR4
+            CONTINUATION_BYTE: "\x80" .. "\xbf"
+            CHAR1: "\x00" .. "\x7f"
+            CHAR2: "\xc0" .. "\xdf" CONTINUATION_BYTE
+            CHAR3: "\xe0" .. "\xef" CONTINUATION_BYTE CONTINUATION_BYTE
+            CHAR4: "\xf0" .. "\xf7" CONTINUATION_BYTE CONTINUATION_BYTE CONTINUATION_BYTE
+            """
+            g = _Lark(g, use_bytes=True)
+            s = u"🔣 地? gurīn".encode('utf-8')
+            self.assertEqual(len(g.parse(s).children), 10)
+
+            for enc, j in [("sjis", u"地球の絵はグリーンでグッド? Chikyuu no e wa guriin de guddo"),
+                           ("sjis", u"売春婦"),
+                           ("euc-jp", u"乂鵬鵠")]:
+                s = j.encode(enc)
+                self.assertRaises(UnexpectedCharacters, g.parse, s)
+
         @unittest.skipIf(PARSER == 'cyk', "Takes forever")
         def test_stack_for_ebnf(self):
             """Verify that stack depth isn't an issue for EBNF grammars"""
@@ -1065,7 +1161,7 @@ def test_regex_quote(self):
             g = _Lark(g)
             self.assertEqual( g.parse('"hello"').children, ['"hello"'])
             self.assertEqual( g.parse("'hello'").children, ["'hello'"])
-            
+
         @unittest.skipIf(not Py36, "Required re syntax only exists in python3.6+")
         def test_join_regex_flags(self):
             g = r"""
@@ -1078,7 +1174,7 @@ def test_join_regex_flags(self):
             self.assertEqual(g.parse(" ").children,[" "])
             self.assertEqual(g.parse("\n ").children,["\n "])
             self.assertRaises(UnexpectedCharacters, g.parse, "\n\n")
-            
+
             g = r"""
             start: A
             A: B | C
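
For reference, a minimal usage sketch of the `use_bytes` option this patch introduces; the grammar and input below are illustrative and not part of the change:

    from lark import Lark

    # Sketch: the grammar is still given as str and must be ASCII-only when
    # use_bytes=True; terminal patterns are matched against latin-1 bytes.
    parser = Lark(r'''
        start: WORD ("," WORD)*
        WORD: /[a-z]+/
        %ignore " "
    ''', parser='lalr', use_bytes=True)

    # The input is passed as bytes rather than str.
    tree = parser.parse(b"hello, world")
    print(tree.pretty())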