Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,10 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
pass
else:
grammar = read()
# Strip BOM (Byte Order Mark) from file content
if isinstance(grammar, (str, bytes)):
from .utils import strip_bom
grammar = strip_bom(grammar)

cache_fn = None
cache_sha256 = None
Expand Down Expand Up @@ -672,6 +676,11 @@ def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[
For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``.

"""
# Strip BOM (Byte Order Mark) from string and bytes inputs
if isinstance(text, (str, bytes)):
from .utils import strip_bom
text = strip_bom(text)

if on_error is not None and self.options.parser != 'lalr':
raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.")
return self.parser.parse(text, start=start, on_error=on_error)
Expand Down
29 changes: 29 additions & 0 deletions lark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,35 @@ def rindex(self, substr: AnyStr):
TextOrSlice = Union[AnyStr, 'TextSlice[AnyStr]']
LarkInput = Union[AnyStr, TextSlice[AnyStr], Any]


def strip_bom(text: Union[str, bytes]) -> Union[str, bytes]:
"""Strip UTF-8 BOM (Byte Order Mark) from the beginning of text if present.

Args:
text: Input text as str or bytes

Returns:
Text with BOM stripped if it was present, otherwise unchanged text

Examples:
>>> strip_bom('\ufeffHello')
'Hello'
>>> strip_bom(b'\xef\xbb\xbfHello')
b'Hello'
>>> strip_bom('Hello')
'Hello'
"""
if isinstance(text, str):
# UTF-8 BOM as Unicode character U+FEFF
if text.startswith('\ufeff'):
return text[1:]
elif isinstance(text, bytes):
# UTF-8 BOM as bytes sequence 0xEF 0xBB 0xBF
if text.startswith(b'\xef\xbb\xbf'):
return text[3:]

return text

###}


Expand Down
85 changes: 85 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2762,6 +2762,91 @@ def test_parse_textslice_fails(self):
s = TextSlice("hello", 2, 3)
self.assertRaises(TypeError, parser.parse, s)

def test_bom_handling(self):
    """BOM (Byte Order Mark) handling: a leading BOM is stripped from
    parser input (str and bytes alike), while a BOM elsewhere in the
    text is left untouched."""
    grammar = """
    start: word+
    word: WORD
    WORD: /\\w+/
    %ignore /\\s+/
    """

    plain = "hello world"
    parser = _Lark(grammar)
    baseline = parser.parse(plain)

    # str input: a leading U+FEFF must not change the parse result.
    with_bom = '\ufeff' + plain
    self.assertEqual(
        parser.parse(with_bom),
        baseline,
        "String input with BOM should parse identically to input without BOM",
    )

    # bytes input: raw 0xEF 0xBB 0xBF prefix, only where use_bytes applies.
    if LEXER in ('basic', 'contextual'):  # Only these lexers support use_bytes
        try:
            bytes_parser = _Lark(grammar, use_bytes=True)
            encoded = plain.encode('utf-8')
            expected_bytes_tree = bytes_parser.parse(encoded)
            self.assertEqual(
                bytes_parser.parse(b'\xef\xbb\xbf' + encoded),
                expected_bytes_tree,
                "Bytes input with BOM should parse identically to input without BOM",
            )
        except Exception:
            # Some lexer/parser combinations might not support use_bytes
            pass

    # A BOM that is not at the very start must NOT be stripped, so it
    # reaches the lexer and triggers an error.
    embedded = "hello \ufeff world"
    with self.assertRaises(Exception, msg="BOM in the middle should cause parsing error since it's not stripped"):
        parser.parse(embedded)

def test_bom_file_handling(self):
    """Test that BOM is properly stripped when reading grammar text from files.

    Writes the same grammar to two temporary files -- one prefixed with a
    UTF-8 BOM, one without -- then checks that parsers built from either
    file's content behave identically to a parser built from the in-memory
    grammar string.
    """
    import tempfile
    import os

    grammar = """
    start: word+
    word: WORD
    WORD: /\\w+/
    %ignore /\\s+/
    """

    # Sample text the grammar accepts; used to compare parser behavior.
    sample_input = "hello world"

    # Write the *grammar* to the files (the previous version mistakenly
    # wrote the sample input, so both file-based parsers were built from
    # an invalid grammar and the BOM path was never exercised).
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.lark', delete=False) as f:
        f.write('\ufeff' + grammar)  # leading UTF-8 BOM
        bom_file_path = f.name

    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.lark', delete=False) as f:
        f.write(grammar)
        normal_file_path = f.name

    try:
        # Baseline: parser built directly from the grammar string.
        parser_direct = _Lark(grammar)
        expected_tree = parser_direct.parse(sample_input)

        # Build parsers from file contents; `with` closes the handles
        # promptly (open(...).read() leaked them, which also breaks the
        # unlink cleanup on Windows).
        with open(normal_file_path, 'r', encoding='utf-8') as f:
            parser_normal = _Lark(f.read())
        with open(bom_file_path, 'r', encoding='utf-8') as f:
            parser_bom = _Lark(f.read())

        result_normal = parser_normal.parse(sample_input)
        result_bom = parser_bom.parse(sample_input)

        self.assertEqual(result_normal, expected_tree, "Grammar from normal file should work correctly")
        self.assertEqual(result_bom, expected_tree, "Grammar from BOM file should work correctly after BOM stripping")
        self.assertEqual(result_normal, result_bom, "Both grammars should behave identically")

    finally:
        # Attempt both unlinks independently so a failure on the first
        # file no longer skips cleanup of the second.
        for path in (bom_file_path, normal_file_path):
            try:
                os.unlink(path)
            except OSError:
                pass


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
Expand Down
Loading