feat(handler): add geom handler for uzip, lzma and zstd compression

rxpha3l · rxpha3l · commit 43b6fcf3b4d1 · 2025-03-28T15:45:37.000+01:00
Geom_uzip is a FreeBSD feature for creating compressed disk images (usually containing UFS). The compression is done in blocks, and the resulting .uzip file can be mounted via the GEOM framework on FreeBSD. The mkuzip header includes a table with block counts and sizes. The header declares the block size (size of decompressed blocks) and total number of blocks. Block size must be a multiple of 512 and defaults to 16384 in mkuzip. It has the following structure: > Magic, which is a shebang & compression identifier stored in 16 bytes. > Format, which is a shell command that provides some general information. > Block size, stored in 4 bytes. > Block count, stored in 4 bytes. > Table of contents (TOC), whose size depends on the file length. The TOC is a list of uint64_t offsets into the file for each block. To determine the length of a given block, read the next TOC entry and subtract the current offset from the next offset (this is why there is an extra TOC entry at the end). Each block is compressed using zlib. A standard zlib decompressor will decode them to a block of size block_size. Unblob parses the TOC to determine the start & end offsets of the compressed file. It detects the compression method (zlib, lzma or zstd). Finally the chunks are decompressed to recover the initial file. Empty chunks are ignored, which is why the file decompressed with unblob can be a little bit smaller than the original one. [Sources] https://github.com/mikeryan/unuzip https://www.baeldung.com/linux/filesystem-in-a-file https://docs.python.org/3/library/zlib.html https://github.com/freebsd/freebsd-src/blob/master/sys/geom/uzip/g_uzip.c https://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html https://www.mail-archive.com/dev-commits-src-main@freebsd.org/msg34955.html
diff --git a/overlay.nix b/overlay.nix
@@ -29,6 +29,8 @@ final: prev:
         ];
       };
 
+      dependencies = (super.dependencies or [ ]) ++ [ final.python3.pkgs.pyzstd ];
+
       # remove this when packaging changes are upstreamed
       cargoDeps = final.rustPlatform.importCargoLock {
         lockFile = ./Cargo.lock;
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
   "pyfatfs>=1.0.5",
   "pyperscan>=0.3.0",
   "python-magic>=0.4.27",
+  "pyzstd",
   "rarfile>=4.1",
   "rich>=13.3.5",
   "structlog>=24.1.0",
diff --git a/python/unblob/handlers/__init__.py b/python/unblob/handlers/__init__.py
@@ -31,6 +31,7 @@
     lzip,
     lzma,
     lzo,
+    uzip,
     xz,
     zlib,
     zstd,
@@ -116,6 +117,7 @@
     zlib.ZlibHandler,
     engenius.EngeniusHandler,
     ecc.AutelECCHandler,
+    uzip.UZIPHandler,
 )
 
 BUILTIN_DIR_HANDLERS: DirectoryHandlers = (
diff --git a/python/unblob/handlers/compression/uzip.py b/python/unblob/handlers/compression/uzip.py
@@ -0,0 +1,133 @@
+import lzma
+import re
+import zlib
+from pathlib import Path
+from typing import Callable, Optional
+
+import pyzstd
+
+from unblob.file_utils import (
+    Endian,
+    FileSystem,
+    InvalidInputFormat,
+    StructParser,
+    iterate_file,
+)
+from unblob.models import (
+    Extractor,
+    ExtractResult,
+    File,
+    Regex,
+    StructHandler,
+    ValidChunk,
+)
+
+# [Ref] https://github.com/freebsd/freebsd-src/tree/master/sys/geom/uzip
+# Layout of the uzip header handed to the struct parser: a 16-byte magic, a
+# 112-byte format field, the 32-bit block size and block count, followed by
+# the table of contents (TOC) of 64-bit file offsets, `block_count` entries
+# long.
+# NOTE(review): the commit description mentions an extra trailing TOC entry,
+# while this struct reads exactly `block_count` entries - confirm which one
+# matches the on-disk format.
+C_DEFINITIONS = r"""
+    typedef struct uzip_header{
+        char magic[16];
+        char format[112];
+        uint32_t block_size;
+        uint32_t block_count;
+        uint64_t toc[block_count];
+    } uzip_header_t;
+"""
+
+# Name of the struct in C_DEFINITIONS that describes the header.
+HEADER_STRUCT = "uzip_header_t"
+
+# Magic values, each exactly 16 characters long to fill `char magic[16]`:
+# a "#!/bin/sh" shebang line followed by a version tag. The tag is what
+# distinguishes the compression method (V2.0: zlib, L3.0: LZMA, Z4.0: Zstandard).
+ZLIB_COMPRESSION = "#!/bin/sh\x0a#V2.0\x20"
+LZMA_COMPRESSION = "#!/bin/sh\x0a#L3.0\x0a"
+ZSTD_COMPRESSION = "#!/bin/sh\x0a#Z4.0\x20"
+
+
+class Decompressor:
+    """Uniform wrapper around a streaming decompressor object.
+
+    Subclasses set ``DECOMPRESSOR`` to a zero-argument callable; it is called
+    once per instance to create the underlying decompressor object.
+    """
+
+    # Factory for the underlying decompressor; only annotated here, so the
+    # base class itself cannot be instantiated - subclasses must assign it.
+    DECOMPRESSOR: Callable
+
+    def __init__(self):
+        """Create a fresh underlying decompressor for this instance."""
+        self._decompressor = self.DECOMPRESSOR()
+
+    def decompress(self, data: bytes) -> bytes:
+        """Feed ``data`` to the underlying decompressor and return its output."""
+        return self._decompressor.decompress(data)
+
+    def flush(self) -> bytes:
+        """Return any remaining output; the default has nothing to flush."""
+        return b""
+
+
+class LZMADecompressor(Decompressor):
+    """Decompressor backed by ``lzma.LZMADecompressor``."""
+
+    DECOMPRESSOR = lzma.LZMADecompressor
+
+
+class ZLIBDecompressor(Decompressor):
+    """Decompressor backed by ``zlib.decompressobj``."""
+
+    DECOMPRESSOR = zlib.decompressobj
+
+    def flush(self) -> bytes:
+        """Return whatever the zlib decompression object still has buffered."""
+        return self._decompressor.flush()
+
+
+class ZSTDDecompressor(Decompressor):
+    """Decompressor backed by ``pyzstd.EndlessZstdDecompressor``."""
+
+    DECOMPRESSOR = pyzstd.EndlessZstdDecompressor
+
+
+# Maps the encoded header magic to the decompressor wrapper class used for
+# the blocks; keys are bytes because they are looked up with `header.magic`.
+DECOMPRESS_METHOD: dict[bytes, type[Decompressor]] = {
+    ZLIB_COMPRESSION.encode(): ZLIBDecompressor,
+    LZMA_COMPRESSION.encode(): LZMADecompressor,
+    ZSTD_COMPRESSION.encode(): ZSTDDecompressor,
+}
+
+
+class UZIPExtractor(Extractor):
+    def extract(self, inpath: Path, outdir: Path):
+        infile = File.from_path(inpath)
+        parser = StructParser(C_DEFINITIONS)
+        header = parser.parse(HEADER_STRUCT, infile, Endian.BIG)
+        fs = FileSystem(outdir)
+        outpath = Path(inpath.stem)
+
+        try:
+            decompressor_cls = DECOMPRESS_METHOD[header.magic]
+        except LookupError:
+            raise InvalidInputFormat("unsupported compression format") from None
+
+        with fs.open(outpath, "wb+") as outfile:
+            for current_offset, next_offset in zip(header.toc[:-1], header.toc[1:]):
+                compressed_len = next_offset - current_offset
+                if compressed_len == 0:
+                    continue
+                decompressor = decompressor_cls()
+                for chunk in iterate_file(infile, current_offset, compressed_len):
+                    outfile.write(decompressor.decompress(chunk))
+                outfile.write(decompressor.flush())
+        return ExtractResult(reports=fs.problems)
+
+
+class UZIPHandler(StructHandler):
+    NAME = "uzip"
+    PATTERNS = [
+        Regex(re.escape(ZLIB_COMPRESSION)),
+        Regex(re.escape(LZMA_COMPRESSION)),
+        Regex(re.escape(ZSTD_COMPRESSION)),
+    ]
+    HEADER_STRUCT = HEADER_STRUCT
+    C_DEFINITIONS = C_DEFINITIONS
+    EXTRACTOR = UZIPExtractor()
+
+    def is_valid_header(self, header) -> bool:
+        return (
+            header.block_count > 0
+            and header.block_size > 0
+            and header.block_size % 512 == 0
+        )
+
+    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
+        header = self.parse_header(file, Endian.BIG)
+
+        if not self.is_valid_header(header):
+            raise InvalidInputFormat("Invalid uzip header.")
+
+        # take the last TOC block offset, end of file is that block offset,
+        # starting from the start offset
+        end_offset = start_offset + header.toc[-1]
+        return ValidChunk(
+            start_offset=start_offset,
+            end_offset=end_offset,
+        )
diff --git a/tests/integration/compression/uzip/lzma/__input__/myfs.img.ulzma b/tests/integration/compression/uzip/lzma/__input__/myfs.img.ulzma
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc53a5de25e6f5326564264fee9e1210067311c237d4d7a8299ebf244652cf05
+size 59392
diff --git a/tests/integration/compression/uzip/lzma/__output__/myfs.img.ulzma_extract/0-59316.uzip b/tests/integration/compression/uzip/lzma/__output__/myfs.img.ulzma_extract/0-59316.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8f164384ebee852bbdc6f5fabcf231fa5fc35d9f236c30e38b9746f871be122
+size 59316
diff --git a/tests/integration/compression/uzip/lzma/__output__/myfs.img.ulzma_extract/0-59316.uzip_extract/0-59316 b/tests/integration/compression/uzip/lzma/__output__/myfs.img.ulzma_extract/0-59316.uzip_extract/0-59316
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328
+size 458752
diff --git a/tests/integration/compression/uzip/lzma/__output__/myfs.img.ulzma_extract/59316-59392.padding b/tests/integration/compression/uzip/lzma/__output__/myfs.img.ulzma_extract/59316-59392.padding
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2c0d5456a983ecd12e314fcfa19879179fc8424343baeb1325457472ae85601
+size 76
diff --git a/tests/integration/compression/uzip/zlib/__input__/myfs.img.uzip b/tests/integration/compression/uzip/zlib/__input__/myfs.img.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e04c83a5444127b9ec58c4b2d8fef904816cee4609d798589d6e8af6086a322
+size 59904
diff --git a/tests/integration/compression/uzip/zlib/__input__/myfs.padded.img.uzip b/tests/integration/compression/uzip/zlib/__input__/myfs.padded.img.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc069dc850a564078858cc13dc743dc5772d1e28edb1ce8a714f5a8749b5d43d
+size 60032
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.img.uzip_extract/0-59397.uzip b/tests/integration/compression/uzip/zlib/__output__/myfs.img.uzip_extract/0-59397.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ba6c3e09fa4b144f8f9fc29721c71df0bee753507c7071bdb8132409ce182d4
+size 59397
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.img.uzip_extract/0-59397.uzip_extract/0-59397 b/tests/integration/compression/uzip/zlib/__output__/myfs.img.uzip_extract/0-59397.uzip_extract/0-59397
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328
+size 458752
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.img.uzip_extract/59397-59904.padding b/tests/integration/compression/uzip/zlib/__output__/myfs.img.uzip_extract/59397-59904.padding
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e48783451cfffa909b2c92ddb2b4c06b836aaa56f16aaab96349e8e9074d45b8
+size 507
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/0-64.unknown b/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/0-64.unknown
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c33ff1723e6b94ae7e6a0ecad3c8a5fc43ab6f39468170f7467e11a8192f6164
+size 64
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/59461-60032.unknown b/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/59461-60032.unknown
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ad2f37110df3519cd58ede90a97a481853f2c9da95db4513d523f94aab9ca8c
+size 571
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/64-59461.uzip b/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/64-59461.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ba6c3e09fa4b144f8f9fc29721c71df0bee753507c7071bdb8132409ce182d4
+size 59397
diff --git a/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/64-59461.uzip_extract/64-59461 b/tests/integration/compression/uzip/zlib/__output__/myfs.padded.img.uzip_extract/64-59461.uzip_extract/64-59461
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328
+size 458752
diff --git a/tests/integration/compression/uzip/zstd/__input__/myfs.img.uzst b/tests/integration/compression/uzip/zstd/__input__/myfs.img.uzst
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f8a24f2de9727324844a152716880a2cb29512d19918ade566811fa3a8ae8d1
+size 58368
diff --git a/tests/integration/compression/uzip/zstd/__output__/myfs.img.uzst_extract/0-58269.uzip b/tests/integration/compression/uzip/zstd/__output__/myfs.img.uzst_extract/0-58269.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:204c31af96edd20c6e074f58663a78106a8671930c76826938dcce4b9553d00e
+size 58269
diff --git a/tests/integration/compression/uzip/zstd/__output__/myfs.img.uzst_extract/0-58269.uzip_extract/0-58269 b/tests/integration/compression/uzip/zstd/__output__/myfs.img.uzst_extract/0-58269.uzip_extract/0-58269
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328
+size 458752
diff --git a/tests/integration/compression/uzip/zstd/__output__/myfs.img.uzst_extract/58269-58368.padding b/tests/integration/compression/uzip/zstd/__output__/myfs.img.uzst_extract/58269-58368.padding
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b298058e1d5fd3f2fa20ead21773912a5dc38da3c0da0bbc7de1adfb6011f1c
+size 99
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:bc53a5de25e6f5326564264fee9e1210067311c237d4d7a8299ebf244652cf05`
	`3`	`+size 59392`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:c8f164384ebee852bbdc6f5fabcf231fa5fc35d9f236c30e38b9746f871be122`
	`3`	`+size 59316`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328`
	`3`	`+size 458752`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:f2c0d5456a983ecd12e314fcfa19879179fc8424343baeb1325457472ae85601`
	`3`	`+size 76`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:1e04c83a5444127b9ec58c4b2d8fef904816cee4609d798589d6e8af6086a322`
	`3`	`+size 59904`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:cc069dc850a564078858cc13dc743dc5772d1e28edb1ce8a714f5a8749b5d43d`
	`3`	`+size 60032`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:2ba6c3e09fa4b144f8f9fc29721c71df0bee753507c7071bdb8132409ce182d4`
	`3`	`+size 59397`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:e48783451cfffa909b2c92ddb2b4c06b836aaa56f16aaab96349e8e9074d45b8`
	`3`	`+size 507`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:c33ff1723e6b94ae7e6a0ecad3c8a5fc43ab6f39468170f7467e11a8192f6164`
	`3`	`+size 64`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:7ad2f37110df3519cd58ede90a97a481853f2c9da95db4513d523f94aab9ca8c`
	`3`	`+size 571`