python-parsy · tsani · Jan 19, 2025 · Jan 19, 2025 · Jan 19, 2025 · Jan 19, 2025
diff --git a/docs/ref/methods_and_combinators.rst b/docs/ref/methods_and_combinators.rst
@@ -23,7 +23,7 @@ can be used and manipulated as below.
    The following methods are for actually **using** the parsers that you have
    created:
 
-   .. method:: parse(string_or_list)
+   .. method:: parse(string_or_list[, source=None])
 
       Attempts to parse the given string (or list). If the parse is successful
       and consumes the entire string, the result is returned - otherwise, a
@@ -36,7 +36,11 @@ can be used and manipulated as below.
       library will work with tokens just as well. See :doc:`/howto/lexing` for
       more information.
 
-   .. method:: parse_partial(string_or_list)
+      When a non-None ``source`` is given (supported for ``str`` and ``bytes`` input only),
+      this name is reported automatically in parse errors. Typically, this is the file path
+      or URL that the data being parsed originates from.
+
+   .. method:: parse_partial(string_or_list[, source=None])
 
       Similar to ``parse``, except that it does not require the entire
       string (or list) to be consumed. Returns a tuple of
@@ -401,6 +405,20 @@ can be used and manipulated as below.
       </howto/lexing/>` and want subsequent parsing of the token stream to be
       able to report original positions in error messages etc.
 
+   .. method:: span()
+
+      Returns a parser that augments the initial parser's result with a :class:`SourceSpan`
+      containing information about where that parser started and stopped within the
+      source data. The new value is a tuple:
+
+      .. code:: python
+
+         (source_span, original_value)
+
+      This enables reporting of custom errors involving source locations, such as when
+      using parsy as a :doc:`lexer</howto/lexing/>` or when building a syntax tree that will be
+      further analyzed.
+
 .. _operators:
 
 Parser operators
@@ -594,3 +612,26 @@ Parsy does not try to include every possible combinator - there is no reason why
 you cannot create your own for your needs using the built-in combinators and
 primitives. If you find something that is very generic and would be very useful
 to have as a built-in, please :doc:`submit </contributing>` as a PR!
+
+Auxiliary data structures
+=========================
+
+.. class:: SourceSpan
+
+   Identifies a span of material from the data being parsed by its start row and column and its end
+   row and column. If the data stream was equipped with a source, that value is also available in
+   this object.
+
+   .. attribute:: start
+
+      The starting position of this span as a tuple (row, col)
+
+   .. attribute:: end
+
+      The stopping position of this span as a tuple (row, col)
+
+   .. attribute:: source
+
+      The name of the source the data was parsed from. This is the same value
+      that was passed to :meth:`Parser.parse` or :meth:`Parser.parse_partial`,
+      or ``None`` if no value was given.
diff --git a/src/parsy/__init__.py b/src/parsy/__init__.py
@@ -12,13 +12,70 @@
 noop = lambda x: x
 
 
+class StrStream(str):
+    """A ``str`` to be parsed that also carries ``source``, a name for where
+    the text came from (e.g. a file path)."""
+
+    def __new__(cls, string, source):
+        stream = str.__new__(cls, string)
+        stream.source = source
+        return stream
+
+
+class ByteStream(bytes):
+    """Bytes data to parse, possibly equipped with a name for the source it's
+    from, e.g. a file path."""
+
+    def __new__(cls, bs, source):
+        instance = super().__new__(cls, bs)
+        instance.source = source
+        return instance
+
+
+def make_stream(data: str | bytes, source: Any):
+    """Wraps `data` in a stream type that also carries `source`: a StrStream
+    for str, a ByteStream for bytes. Any other type of data (including a list
+    of tokens) is rejected with a RuntimeError, since only str and bytes can
+    be equipped with a source."""
+    if isinstance(data, str):
+        return StrStream(data, source)
+
+    if isinstance(data, bytes):
+        return ByteStream(data, source)
+
+    raise RuntimeError(
+        "A Parsy stream can be formed only on str and bytes, but the given "
+        f"data has type {type(data)}. If you are separately tokenizing the "
+        "data to parse, consider instead equipping the tokens with source "
+        "location metadata.",
+    )
+
+
+@dataclass
+class SourceSpan:
+    """Identifies a span of material from the data to parse.
+
+    Attributes:
+        source (str | None): the source of the data, e.g. a file path.
+        start (tuple[int, int]): the start row and column of the span.
+        end (tuple[int, int]): the end row and column of the span.
+    """
+
+    source: str | None
+    start: tuple[int, int]
+    end: tuple[int, int]
+
+
 def line_info_at(stream, index):
     if index > len(stream):
         raise ValueError("invalid index")
     line = stream.count("\n", 0, index)
     last_nl = stream.rfind("\n", 0, index)
     col = index - (last_nl + 1)
-    return (line, col)
+    if hasattr(stream, "source"):
+        return (line, col, stream.source)
+    else:
+        return (line, col)
 
 
 class ParseError(RuntimeError):
@@ -29,7 +86,15 @@ def __init__(self, expected, stream, index):
 
     def line_info(self):
         try:
-            return "{}:{}".format(*line_info_at(self.stream, self.index))
+            info = line_info_at(self.stream, self.index)
+            if len(info) == 2:
+                row, col = info
+                return f"{row}:{col}"
+            elif len(info) == 3:
+                row, col, source = info  # line_info_at puts the source last
+                return f"{source}:{row}:{col}"
+            else:
+                raise RuntimeError("Internal line_info_at violates length expectation.")
         except (TypeError, AttributeError):  # not a str
             return str(self.index)
 
@@ -90,20 +155,23 @@ def __init__(self, wrapped_fn: Callable[[str | bytes | list, int], Result]):
         """
         self.wrapped_fn = wrapped_fn
 
-    def __call__(self, stream: str | bytes | list, index: int):
+    def __call__(self, stream, index: int):
         return self.wrapped_fn(stream, index)
 
-    def parse(self, stream: str | bytes | list) -> Any:
+    def parse(self, stream, source=None) -> Any:
         """Parses a string or list of tokens and returns the result or raise a ParseError."""
-        (result, _) = (self << eof).parse_partial(stream)
+        (result, _) = (self << eof).parse_partial(stream, source)
         return result
 
-    def parse_partial(self, stream: str | bytes | list) -> tuple[Any, str | bytes | list]:
+    def parse_partial(self, stream, source=None) -> tuple[Any, str | bytes | list]:
         """
         Parses the longest possible prefix of a given string.
         Returns a tuple of the result and the unparsed remainder,
         or raises ParseError
         """
+        if source is not None:
+            stream = make_stream(stream, source)
+
         result = self(stream, 0)
 
         if result.status:
@@ -339,17 +407,47 @@ def mark(self) -> Parser:
         ((start_row, start_column),
          original_value,
          (end_row, end_column))
+
+        ``.span()`` is a more powerful version of this combinator, returning a
+        SourceSpan.
         """
 
         @generate
         def marked():
             start = yield line_info
             body = yield self
             end = yield line_info
+            # line_info returns a 3-tuple including the source when a source
+            # was given to `parse`, but older programs expect these tuples to
+            # have length 2, consisting of just row and col
+            start = start[:2]
+            end = end[:2]
             return (start, body, end)
 
         return marked
 
+    def span(self) -> Parser:
+        """
+        Returns a parser that pairs the initial parser's result with a
+        SourceSpan recording where that parser started and stopped.
+        The produced value is a tuple:
+
+        (source_span, original_value)
+        """
+
+        @generate
+        def spanned():
+            begin = yield line_info
+            value = yield self
+            finish = yield line_info
+            # line_info yields (row, col) when the stream has no source and
+            # (row, col, source) when it does; only the latter has index 2.
+            name = begin[2] if len(begin) > 2 else None
+            location = SourceSpan(name, begin[:2], finish[:2])
+            return (location, value)
+
+        return spanned
+
     def tag(self, name: str) -> Parser:
         """
         Returns a parser that wraps the produced value of the initial parser in a
@@ -578,8 +676,7 @@ def test_item(func: Callable[..., bool], description: str) -> Parser:
     def test_item_parser(stream, index):
         if index < len(stream):
             if isinstance(stream, bytes):
-                # Subscripting bytes with `[index]` instead of
-                # `[index:index + 1]` returns an int
+                # Otherwise directly indexing a bytes gives `int`
                 item = stream[index : index + 1]
             else:
                 item = stream[index]

diff --git a/tests/test_parsy.py b/tests/test_parsy.py
@@ -7,6 +7,7 @@
 
 from parsy import (
     ParseError,
+    SourceSpan,
     alt,
     any_char,
     char_from,
@@ -19,6 +20,7 @@
     letter,
     line_info,
     line_info_at,
+    make_stream,
     match_item,
     peek,
     regex,
@@ -208,6 +210,35 @@ def test_mark(self):
         self.assertEqual(letters, ["q", "w", "e", "r"])
         self.assertEqual(end, (1, 4))
 
+    def test_span(self):
+        parser = (letter.many().span() << string("\n")).many()
+        source = "sample"
+
+        lines = parser.parse("asdf\nqwer\n", source=source)
+
+        self.assertEqual(len(lines), 2)
+
+        (span, letters) = lines[0]
+        self.assertEqual(span, SourceSpan(source, (0, 0), (0, 4)))
+        self.assertEqual(letters, ["a", "s", "d", "f"])
+
+        (span, letters) = lines[1]
+        self.assertEqual(span, SourceSpan(source, (1, 0), (1, 4)))
+
+    def test_span_no_source(self):
+        parser = (letter.many().span() << string("\n")).many()
+
+        lines = parser.parse("asdf\nqwer\n")
+
+        self.assertEqual(len(lines), 2)
+
+        (span, letters) = lines[0]
+        self.assertEqual(span, SourceSpan(None, (0, 0), (0, 4)))
+        self.assertEqual(letters, ["a", "s", "d", "f"])
+
+        (span, letters) = lines[1]
+        self.assertEqual(span, SourceSpan(None, (1, 0), (1, 4)))
+
     def test_tag(self):
         parser = letter.many().concat().tag("word")
         self.assertEqual(
@@ -589,6 +620,18 @@ def foo():
             ],
         )
 
+        source = "aaaaa"
+        self.assertEqual(
+            foo.many().parse("AB\nCD", source=source),
+            [
+                ("A", (0, 0, source)),
+                ("B", (0, 1, source)),
+                ("\n", (0, 2, source)),
+                ("C", (1, 0, source)),
+                ("D", (1, 1, source)),
+            ],
+        )
+
     def test_should_fail(self):
         not_a_digit = digit.should_fail("not a digit") >> regex(r".*")
 
@@ -683,12 +726,23 @@ def foo():
 
 class TestUtils(unittest.TestCase):
     def test_line_info_at(self):
+
         text = "abc\ndef"
         self.assertEqual(line_info_at(text, 0), (0, 0))
         self.assertEqual(line_info_at(text, 2), (0, 2))
         self.assertEqual(line_info_at(text, 3), (0, 3))
         self.assertEqual(line_info_at(text, 4), (1, 0))
         self.assertEqual(line_info_at(text, 7), (1, 3))
+
+        self.assertRaises(ValueError, lambda: line_info_at(text, 8))
+
+        text = make_stream("abc\ndef", source="aaaa")
+        self.assertEqual(line_info_at(text, 0), (0, 0, "aaaa"))
+        self.assertEqual(line_info_at(text, 2), (0, 2, "aaaa"))
+        self.assertEqual(line_info_at(text, 3), (0, 3, "aaaa"))
+        self.assertEqual(line_info_at(text, 4), (1, 0, "aaaa"))
+        self.assertEqual(line_info_at(text, 7), (1, 3, "aaaa"))
+
         self.assertRaises(ValueError, lambda: line_info_at(text, 8))