Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,44 @@
# Changelog

## v0.3.*

### v0.3.1 - 2025-07-28

[#6](https://github.com/p2p-ld/torrent-models/pull/6)
Add ability to get v1 and v2 byte ranges to validate partial data against.

The behavior differs significantly between v1 and v2, so we made separate implementations for each

- v1: {meth}`.Torrent.v1_piece_range` a piece may correspond to a range within a single file or across several, and may include padfiles that shouldn't really "exist" on a filesystem
- v2: {meth}`.Torrent.v2_piece_range` much simpler, a file either has a single root hash or a set of hashes from a lower level of the merkle tree, both are computed identically. pieces are always either a whole file or a part of a single file.

These correspond to the models returned, which both have a {meth}`~torrent_models.types.common.PieceRange.validate_data` method:

- v1: {class}`~torrent_models.types.v1.V1PieceRange`
- v2: {class}`~torrent_models.types.v2.V2PieceRange`

So we have two methods to get v1 and v2 ranges, which return a PieceRange object that can validate data passed to `validate_data`

so e.g. if we have a v1 torrent of 5 10KiB files of all zeros, and a piece size of 32 KiB, we might do something like this

```python
piece_range = torrent.v1_piece_range(0)
piece_range.validate_data([bytes(10), bytes(10), bytes(10), bytes(2)])
```

and v2 torrents work at the block level, as they usually do, so if we had a single-file v2 torrent with an empty 64 KiB file with a piece size of 64KiB, we would do

```python
piece_range = torrent.v2_piece_range('filename')
piece_range.validate_data([bytes(16 * KiB) for _ in range(4)])
```

#### Breaking

- changed the behavior of v2 piece layers dict to match v1 pieces:
when in memory, we split up the pieces into a list of hashes, rather than one single large bytestring,
and then split again on serialization.

## v0.2.*

### v0.2.1 - 2025-07-27
Expand Down
175 changes: 157 additions & 18 deletions src/torrent_models/torrent.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
UnixDatetime,
str_keys,
)
from torrent_models.types.v2 import FileTree
from torrent_models.types.v1 import FileItemRange, V1PieceRange
from torrent_models.types.v2 import FileTree, V2PieceRange


class TorrentBase(ConfiguredBase):
Expand Down Expand Up @@ -213,6 +214,158 @@ class Torrent(TorrentBase):
A valid torrent file, including hashes.
"""

@property
def file_size(self) -> int:
    """Size of the generated torrent file, in bytes"""
    encoded = self.bencode()
    return len(encoded)

def bencode(self) -> bytes:
    """Serialize the torrent to bencoded bytes."""
    # dump with string keys first, then bencode the resulting dict
    return bencode_rs.bencode(self.model_dump_torrent(mode="str"))

def write(self, path: Path) -> None:
    """Write the torrent to disk"""
    encoded = self.bencode()
    with open(path, "wb") as f:
        f.write(encoded)

def v1_piece_range(self, piece_idx: int) -> V1PieceRange:
    """
    Get a v1 piece range from the piece index.

    Args:
        piece_idx: index into the info dict's ``pieces`` list.

    Returns:
        :class:`.V1PieceRange` describing the file(s) and byte ranges
        that make up the piece, along with its expected SHA1 hash.

    Raises:
        IndexError: if ``piece_idx`` is out of range for this torrent.
        AssertionError: if called on a v2-only torrent.
    """
    assert self.torrent_version in (
        TorrentVersion.v1,
        TorrentVersion.hybrid,
    ), "Cannot get v1 piece ranges for v2-only torrents"
    self.info = cast(InfoDictV1 | InfoDictHybrid, self.info)
    if piece_idx >= len(self.info.pieces):
        raise IndexError(
            f"Cannot get piece index {piece_idx} for torrent with "
            f"{len(self.info.pieces)} pieces"
        )

    # absolute byte range of the piece within the concatenated file stream
    start_range = piece_idx * self.info.piece_length
    end_range = (piece_idx + 1) * self.info.piece_length

    if self.info.files is None:
        self.info.length = cast(int, self.info.length)
        # single file torrent - the piece is a slice of the one file,
        # clamped at the end for the (usually shorter) final piece
        return V1PieceRange(
            piece_idx=piece_idx,
            piece_hash=self.info.pieces[piece_idx],
            ranges=[
                FileItemRange(
                    path=[self.info.name],
                    length=self.info.length,
                    range_start=start_range,
                    range_end=min(self.info.length, end_range),
                )
            ],
        )

    size_idx = 0  # running total of the lengths of files already passed
    file_idx = 0
    found_len = 0
    ranges = []
    # first, find file where range starts
    # could probably be combined with the second step,
    # but just getting this working before worrying about aesthetics
    for i, file in enumerate(self.info.files):
        if file.length + size_idx > start_range:
            # range starts in this file.
            # BUGFIX: offset within the file is the piece start minus the
            # total size of all preceding files. The previous
            # `start_range % size_idx` is only coincidentally equal to this
            # when start_range < 2 * size_idx, and gave wrong offsets when
            # a small file preceded a large one.
            file_range_start = start_range - size_idx
            file_range_end = min(file.length, file_range_start + self.info.piece_length)
            found_len += file_range_end - file_range_start
            ranges.append(
                FileItemRange(
                    path=file.path,
                    attr=file.attr,
                    length=file.length,
                    range_start=file_range_start,
                    range_end=file_range_end,
                )
            )

            # index additional files starting at the next file
            file_idx = i + 1
            break
        else:
            size_idx += file.length

    # then, iterate through files until the range or files are exhausted
    # (the final piece may end before a full piece_length is accumulated)
    while found_len < self.info.piece_length and file_idx < len(self.info.files):
        file = self.info.files[file_idx]
        file_range_start = 0
        file_range_end = min(file.length, self.info.piece_length - found_len)

        ranges.append(
            FileItemRange(
                path=file.path,
                attr=file.attr,
                length=file.length,
                range_start=file_range_start,
                range_end=file_range_end,
            )
        )
        found_len += file_range_end - file_range_start
        file_idx += 1
    return V1PieceRange(
        piece_idx=piece_idx, ranges=ranges, piece_hash=self.info.pieces[piece_idx]
    )

def v2_piece_range(self, file: str, piece_idx: int = 0) -> V2PieceRange:
    """
    Get a v2 piece range from a file path and optional piece index.

    When ``piece_idx`` is left at its default of 0:

    - If the file is larger than the piece length, gets the 0th piece.
    - If the file is smaller than the piece length,
      the range corresponds to the whole file, the hash is the root hash,
      and piece_idx is ignored.
    """
    assert self.torrent_version in (
        TorrentVersion.v2,
        TorrentVersion.hybrid,
    ), "Cannot get v2 piece ranges from a v1-only torrent"

    # narrow types to satisfy mypy
    self.info = cast(InfoDictV2 | InfoDictHybrid, self.info)
    tree_items = cast(dict[str, FileTreeItem], self.flat_files)
    self.piece_layers = cast(PieceLayersType, self.piece_layers)

    if file not in tree_items:
        raise ValueError(f"file {file} not found in torrent!")

    item = tree_items[file]
    root = item["pieces root"]
    file_length = item["length"]

    if root not in self.piece_layers:
        # file is smaller than piece_length: no piece layer entry exists,
        # so the piece range is the whole file verified by its root hash
        return V2PieceRange(
            piece_idx=0,
            path=file,
            range_start=0,
            range_end=file_length,
            piece_length=self.info.piece_length,
            file_size=file_length,
            root_hash=root,
        )

    layer = self.piece_layers[root]
    if piece_idx >= len(layer):
        raise IndexError(
            f"piece index {piece_idx} is out of range for file with "
            f"{len(layer)} pieces"
        )
    start = piece_idx * self.info.piece_length
    return V2PieceRange(
        piece_idx=piece_idx,
        path=file,
        range_start=start,
        range_end=min(file_length, start + self.info.piece_length),
        piece_length=self.info.piece_length,
        file_size=file_length,
        piece_hash=layer[piece_idx],
        root_hash=root,
    )

@model_validator(mode="after")
def piece_layers_if_v2(self) -> Self:
"""If we are a v2 or hybrid torrent, we should have piece layers"""
Expand All @@ -228,7 +381,7 @@ def pieces_layers_correct(self) -> Self:
"""
if self.torrent_version == TorrentVersion.v1:
return self
self.piece_layers = cast(dict[bytes, bytes], self.piece_layers)
self.piece_layers = cast(PieceLayersType, self.piece_layers)
self.info = cast(InfoDictV2 | InfoDictHybrid, self.info)
for path, file_info in self.info.flat_tree.items():
if file_info["length"] > self.info.piece_length:
Expand All @@ -237,28 +390,14 @@ def pieces_layers_correct(self) -> Self:
f"Expected to find: {file_info['pieces root']}" # type: ignore
)
expected_pieces = ceil(file_info["length"] / self.info.piece_length)
assert len(self.piece_layers[file_info["pieces root"]]) == expected_pieces * 32, (
assert len(self.piece_layers[file_info["pieces root"]]) == expected_pieces, (
f"File {path} does not have the correct number of piece hashes. "
f"Expected {expected_pieces} hashes from file length {file_info['length']} "
f"and piece length {self.info.piece_length}. "
f"Got {len(self.piece_layers[file_info['pieces root']]) / 32}"
f"Got {len(self.piece_layers[file_info['pieces root']])}"
)
return self

def bencode(self) -> bytes:
dumped = self.model_dump_torrent(mode="str")
return bencode_rs.bencode(dumped)

def write(self, path: Path) -> None:
"""Write the torrent to disk"""
with open(path, "wb") as f:
f.write(self.bencode())

@property
def file_size(self) -> int:
"""Size of the generated torrent file, in bytes"""
return len(self.bencode())


def pprint(t: TorrentBase, verbose: int = 0) -> None:
"""
Expand Down
21 changes: 20 additions & 1 deletion src/torrent_models/types/common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import sys
from abc import abstractmethod
from enum import StrEnum
from pathlib import Path
from typing import Annotated, NotRequired, TypeAlias

from annotated_types import Ge, Len
from pydantic import AfterValidator, AnyUrl, Field
from pydantic import AfterValidator, AnyUrl, BaseModel, Field

from torrent_models.base import ConfiguredBase
from torrent_models.types.serdes import ByteStr
Expand Down Expand Up @@ -81,3 +82,21 @@ class GenericFileItem(ConfiguredBase):
length: Annotated[int, Ge(0)]
attr: bytes | None = None
pieces_root: bytes | None = Field(None, alias="pieces root")


class PieceRange(BaseModel):
    """
    Parent model for v1 and v2 piece ranges.

    Piece ranges provide some description of paths and byte ranges that correspond to a single
    verifiable piece and a method for verifying data against them.

    Since v1 and v2 data models are substantially different,
    their sub-models are also quite different, but provide a common interface through this ABC
    """

    # index of the piece (for v1 subclasses: within the whole torrent;
    # for v2 subclasses: within a single file)
    piece_idx: int

    # NOTE(review): @abstractmethod on a pydantic BaseModel relies on pydantic's
    # metaclass honoring abstract methods - confirm instantiation is actually blocked
    @abstractmethod
    def validate_data(self, data: list[bytes]) -> bool:
        """Check that the provided data matches the piece or root hash"""
40 changes: 39 additions & 1 deletion src/torrent_models/types/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Types used only in v1 (and hybrid) torrents
"""

import hashlib
from typing import Annotated, Self

from annotated_types import Ge
Expand All @@ -15,7 +16,7 @@
from pydantic_core.core_schema import SerializationInfo

from torrent_models.base import ConfiguredBase
from torrent_models.types.common import FilePart, SHA1Hash, _power_of_two
from torrent_models.types.common import FilePart, PieceRange, SHA1Hash, _power_of_two

V1PieceLength = Annotated[int, AfterValidator(_power_of_two)]
"""
Expand Down Expand Up @@ -78,3 +79,40 @@ def strict_padfile_naming(self, info: ValidationInfo) -> Self:
str(self.length),
], "strict mode - padfiles must be named `.pad/{length}`"
return self


class FileItemRange(FileItem):
    """A File Item with a byte range, for use with V1PieceRange"""

    # byte offsets of the piece's slice *within this file* (not within the
    # whole torrent stream); range_end is exclusive, so the slice length is
    # range_end - range_start
    range_start: int
    range_end: int


class V1PieceRange(PieceRange):
    """
    Paths and byte ranges that correspond to a single v1 piece.
    """

    # one entry per file the piece spans, in stream order
    ranges: list[FileItemRange]
    piece_hash: SHA1Hash

    def validate_data(self, data: list[bytes]) -> bool:
        """
        Validate data against hash by concatenating bytes and comparing the SHA1 hash

        The user is responsible for providing all-zero bytestrings
        for any padding files in the indicated ranges

        Args:
            data: one chunk per entry in ``ranges``, each exactly
                ``range_end - range_start`` bytes long.

        Returns:
            ``True`` if the SHA1 of the concatenated chunks equals ``piece_hash``.
        """
        assert len(data) == len(
            self.ranges
        ), "Need to provide data chunks that correspond to each of the indicated file ranges"
        # `file_range` rather than `range` - don't shadow the builtin
        for file_range, chunk in zip(self.ranges, data):
            assert (file_range.range_end - file_range.range_start) == len(chunk), (
                "Provided data chunks must match the sizes indicated by the "
                "start and end ranges of each file range"
            )

        # hashlib.sha1() is the direct constructor; hashlib.new("sha1") is the
        # slower generic lookup for dynamically-named algorithms
        hasher = hashlib.sha1()
        for chunk in data:
            hasher.update(chunk)
        return self.piece_hash == hasher.digest()
Loading
Loading