Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,10 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
pass
else:
grammar = read()
# Strip BOM (Byte Order Mark) from file content
if isinstance(grammar, (str, bytes)):
from .utils import strip_bom
grammar = strip_bom(grammar)

cache_fn = None
cache_sha256 = None
Expand Down Expand Up @@ -672,6 +676,11 @@ def parse(self, text: LarkInput, start: Optional[str]=None, on_error: 'Optional[
For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``.

"""
# Strip BOM (Byte Order Mark) from string and bytes inputs
if isinstance(text, (str, bytes)):
from .utils import strip_bom
text = strip_bom(text)

if on_error is not None and self.options.parser != 'lalr':
raise NotImplementedError("The on_error option is only implemented for the LALR(1) parser.")
return self.parser.parse(text, start=start, on_error=on_error)
Expand Down
29 changes: 29 additions & 0 deletions lark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,35 @@ def rindex(self, substr: AnyStr):
TextOrSlice = Union[AnyStr, 'TextSlice[AnyStr]']
LarkInput = Union[AnyStr, TextSlice[AnyStr], Any]


def strip_bom(text: Union[str, bytes]) -> Union[str, bytes]:
"""Strip UTF-8 BOM (Byte Order Mark) from the beginning of text if present.

Args:
text: Input text as str or bytes

Returns:
Text with BOM stripped if it was present, otherwise unchanged text

Examples:
>>> strip_bom('\ufeffHello')
'Hello'
>>> strip_bom(b'\xef\xbb\xbfHello')
b'Hello'
>>> strip_bom('Hello')
'Hello'
"""
if isinstance(text, str):
# UTF-8 BOM as Unicode character U+FEFF
if text.startswith('\ufeff'):
return text[1:]
elif isinstance(text, bytes):
# UTF-8 BOM as bytes sequence 0xEF 0xBB 0xBF
if text.startswith(b'\xef\xbb\xbf'):
return text[3:]

return text

###}


Expand Down
85 changes: 85 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2762,6 +2762,91 @@ def test_parse_textslice_fails(self):
s = TextSlice("hello", 2, 3)
self.assertRaises(TypeError, parser.parse, s)

def test_bom_handling(self):
    """BOM (Byte Order Mark) handling: a leading BOM is stripped from
    parser input (str and bytes alike), while a BOM elsewhere in the
    text is left untouched."""
    grammar = """
    start: word+
    word: WORD
    WORD: /\\w+/
    %ignore /\\s+/
    """

    plain = "hello world"
    parser = _Lark(grammar)
    baseline = parser.parse(plain)

    # str input: a leading U+FEFF must not change the parse result.
    with_bom = '\ufeff' + plain
    self.assertEqual(
        parser.parse(with_bom),
        baseline,
        "String input with BOM should parse identically to input without BOM",
    )

    # bytes input: raw 0xEF 0xBB 0xBF prefix, only where use_bytes applies.
    if LEXER in ('basic', 'contextual'):  # Only these lexers support use_bytes
        try:
            bytes_parser = _Lark(grammar, use_bytes=True)
            encoded = plain.encode('utf-8')
            expected_bytes_tree = bytes_parser.parse(encoded)
            self.assertEqual(
                bytes_parser.parse(b'\xef\xbb\xbf' + encoded),
                expected_bytes_tree,
                "Bytes input with BOM should parse identically to input without BOM",
            )
        except Exception:
            # Some lexer/parser combinations might not support use_bytes
            pass

    # A BOM that is not at the very start must NOT be stripped, so it
    # reaches the lexer and triggers an error.
    embedded = "hello \ufeff world"
    with self.assertRaises(Exception, msg="BOM in the middle should cause parsing error since it's not stripped"):
        parser.parse(embedded)

def test_bom_file_handling(self):
    """Test that BOM is properly stripped when reading grammar text from files.

    Writes the same grammar to two temporary files -- one prefixed with a
    UTF-8 BOM, one without -- then checks that parsers built from either
    file's content behave identically to a parser built from the in-memory
    grammar string.
    """
    import tempfile
    import os

    grammar = """
    start: word+
    word: WORD
    WORD: /\\w+/
    %ignore /\\s+/
    """

    # Sample text the grammar accepts; used to compare parser behavior.
    sample_input = "hello world"

    # Write the *grammar* to the files (the previous version mistakenly
    # wrote the sample input, so both file-based parsers were built from
    # an invalid grammar and the BOM path was never exercised).
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.lark', delete=False) as f:
        f.write('\ufeff' + grammar)  # leading UTF-8 BOM
        bom_file_path = f.name

    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.lark', delete=False) as f:
        f.write(grammar)
        normal_file_path = f.name

    try:
        # Baseline: parser built directly from the grammar string.
        parser_direct = _Lark(grammar)
        expected_tree = parser_direct.parse(sample_input)

        # Build parsers from file contents; `with` closes the handles
        # promptly (open(...).read() leaked them, which also breaks the
        # unlink cleanup on Windows).
        with open(normal_file_path, 'r', encoding='utf-8') as f:
            parser_normal = _Lark(f.read())
        with open(bom_file_path, 'r', encoding='utf-8') as f:
            parser_bom = _Lark(f.read())

        result_normal = parser_normal.parse(sample_input)
        result_bom = parser_bom.parse(sample_input)

        self.assertEqual(result_normal, expected_tree, "Grammar from normal file should work correctly")
        self.assertEqual(result_bom, expected_tree, "Grammar from BOM file should work correctly after BOM stripping")
        self.assertEqual(result_normal, result_bom, "Both grammars should behave identically")

    finally:
        # Attempt both unlinks independently so a failure on the first
        # file no longer skips cleanup of the second.
        for path in (bom_file_path, normal_file_path):
            try:
                os.unlink(path)
            except OSError:
                pass


_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
Expand Down
Loading