Merge pull request #1918 from braingram/junk_data

braingram · web-flow · commit 97a9fe49e449 · 2025-04-29T15:40:57.000-04:00
Allow non-null padding bytes after the tree and before the first block
diff --git a/asdf/_block/reader.py b/asdf/_block/reader.py
@@ -2,7 +2,7 @@
 import weakref
 
 from asdf import constants
-from asdf.exceptions import AsdfBlockIndexWarning, AsdfWarning
+from asdf.exceptions import AsdfBlockIndexWarning, AsdfWarning, DelimiterNotFoundError
 
 from . import io as bio
 from .exceptions import BlockIndexError
@@ -122,55 +122,42 @@ def _read_blocks_serially(fd, memmap=False, lazy_load=False, validate_checksums=
     For parameter and return value descriptions see `read_blocks`.
     """
     blocks = []
-    buff = b""
     magic_len = len(constants.BLOCK_MAGIC)
-    while True:
-        # the expectation is that this will begin PRIOR to the block magic
-        # read 4 bytes
-        if not after_magic:
-            buff += fd.read(magic_len - len(buff))
-            if len(buff) == 0:
-                # we are done, there are no more blocks and no index
-                break
-            elif len(buff) < magic_len:
-                # we have less than magic_len bytes, this is likely an error
-                # in the input file/bytes
-                if all([b == 0 for b in buff]):
-                    # if these are all 0, assume this was a 'truncated' file
-                    # so don't issue a warning
-                    break
-                # if these are non-0 bytes issue a warning that the file
-                # is likely corrupt
-                msg = f"Read invalid bytes {buff!r} after blocks, your file might be corrupt"
-                warnings.warn(msg, AsdfWarning)
-                break
-
-        if buff == constants.INDEX_HEADER[:magic_len]:
-            # we hit the block index, which is not useful here
-            break
-
-        if after_magic or buff == constants.BLOCK_MAGIC:
-            # this is another block
-            offset, header, data_offset, data = bio.read_block(fd, memmap=memmap, lazy_load=lazy_load)
-            blocks.append(
-                ReadBlock(
-                    offset, fd, memmap, lazy_load, validate_checksums, header=header, data_offset=data_offset, data=data
-                )
+
+    if not after_magic:
+        # seek until the first magic is found
+        try:
+            fd.seek_until(b"(" + constants.BLOCK_MAGIC + b")", magic_len)
+        except DelimiterNotFoundError:
+            return blocks
+        after_magic = True
+
+    buff = constants.BLOCK_MAGIC
+    while buff == constants.BLOCK_MAGIC:
+        # read the block
+        offset, header, data_offset, data = bio.read_block(fd, memmap=memmap, lazy_load=lazy_load)
+        blocks.append(
+            ReadBlock(
+                offset, fd, memmap, lazy_load, validate_checksums, header=header, data_offset=data_offset, data=data
             )
-            if blocks[-1].header["flags"] & constants.BLOCK_FLAG_STREAMED:
-                # a file can only have 1 streamed block and it must be at the end so we
-                # can stop looking for more blocks
-                break
-            buff = b""
-            after_magic = False
-        else:
-            if len(blocks) or buff[0] != 0:
-                # if this is not the first block or we haven't found any
-                # blocks and the first byte is non-zero
-                msg = f"Invalid bytes while reading blocks {buff}"
-                raise OSError(msg)
-            # this is the first block, allow empty bytes before block
-            buff = buff.strip(b"\0")
+        )
+        if blocks[-1].header["flags"] & constants.BLOCK_FLAG_STREAMED:
+            # a file can only have 1 streamed block and it must be at the end so we
+            # can stop looking for more blocks
+            return blocks
+
+        # check for the next block
+        buff = fd.read(magic_len)
+
+    # check remaining bytes
+    if buff == constants.INDEX_HEADER[: len(buff)]:
+        # remaining bytes are the start of the block index
+        return blocks
+    if buff == b"\0" * len(buff):
+        # remaining bytes are null
+        return blocks
+    msg = f"Read invalid bytes {buff!r} after blocks, your file might be corrupt"
+    warnings.warn(msg, AsdfWarning)
     return blocks
 
 
diff --git a/asdf/_tests/_block/test_reader.py b/asdf/_tests/_block/test_reader.py
@@ -78,9 +78,18 @@ def test_read(tmp_path, lazy_load, memmap, with_index, validate_checksums, paddi
         assert r[0].cached_data is r[0].cached_data
 
 
-def test_read_invalid_padding():
-    with gen_blocks(padding=1, padding_byte=b"\1") as (fd, check):
-        with pytest.raises(OSError, match="Invalid bytes.*"):
+@pytest.mark.parametrize("padding", (1, 4, 7))
+@pytest.mark.parametrize("padding_byte", (b"\1", b"\0", b" ", b"\xd3", b"B", b"L", b"K", b"\xd3BL"))
+def test_read_valid_padding(padding, padding_byte):
+    """Test that reader allows padding bytes before the first block"""
+    with gen_blocks(padding=padding, padding_byte=padding_byte) as (fd, check):
+        check(read_blocks(fd))
+
+
+@pytest.mark.parametrize("padding_byte", (b"\xd3BLK", b" \xd3BLK"))
+def test_read_invalid_padding(padding_byte):
+    with gen_blocks(padding=1, padding_byte=padding_byte) as (fd, check):
+        with pytest.raises(ValueError, match="buffer is smaller than requested size"):
             check(read_blocks(fd))
 
 
diff --git a/changes/1918.bugfix.rst b/changes/1918.bugfix.rst
@@ -0,0 +1 @@
+Allow non-null bytes before the first block.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Allow non-null bytes before the first block.`