Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,44 @@
# Changelog

## v0.3.*

### v0.3.1 - 2025-07-28

[#6](https://github.com/p2p-ld/torrent-models/pull/6)
Add ability to get v1 and v2 byte ranges to validate partial data against.

The behavior differs significantly between v1 and v2, so we made separate implementations for each

- v1: {meth}`.Torrent.v1_piece_range` a piece may correspond to a range within a single file or across several, and may include padfiles that shouldn't really "exist" on a filesystem
- v2: {meth}`.Torrent.v2_piece_range` much simpler, a file either has a single root hash or a set of hashes from a lower level of the merkle tree, both are computed identically. pieces are always either a whole file or a part of a single file.

These correspond to the models returned, which both have a {meth}`~torrent_models.types.common.PieceRange.validate_data` method:

- v1: {class}`~torrent_models.types.v1.V1PieceRange`
- v2: {class}`~torrent_models.types.v2.V2PieceRange`

So we have two methods to get v1 and v2 ranges, which return a PieceRange object that can validate data passed to `validate_data`

so e.g. if we have a v1 torrent of 5 10KiB files of all zeros, and a piece size of 32 KiB, we might do something like this

```python
piece_range = torrent.v1_piece_range(0)
piece_range.validate_data([bytes(10), bytes(10), bytes(10), bytes(2)])
```

and v2 torrents work at the block level, as they usually do, so if we had a single-file v2 torrent with an empty 64 KiB file with a piece size of 64KiB, we would do

```python
piece_range = torrent.v2_piece_range('filename')
piece_range.validate_data([bytes(16 * KiB) for _ in range(4)])
```

#### Breaking

- changed the behavior of v2 piece layers dict to match v1 pieces:
when in memory, we split up the pieces into a list of hashes, rather than one single large bytestring,
and then split again on serialization.

## v0.2.*

### v0.2.1 - 2025-07-27
Expand Down
175 changes: 157 additions & 18 deletions src/torrent_models/torrent.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
UnixDatetime,
str_keys,
)
from torrent_models.types.v2 import FileTree
from torrent_models.types.v1 import FileItemRange, V1PieceRange
from torrent_models.types.v2 import FileTree, V2PieceRange


class TorrentBase(ConfiguredBase):
Expand Down Expand Up @@ -213,6 +214,158 @@ class Torrent(TorrentBase):
A valid torrent file, including hashes.
"""

@property
def file_size(self) -> int:
    """Size of the generated torrent file, in bytes"""
    encoded = self.bencode()
    return len(encoded)

def bencode(self) -> bytes:
    """Serialize the torrent to bencoded bytes."""
    # dump with string keys first, then bencode the resulting dict
    return bencode_rs.bencode(self.model_dump_torrent(mode="str"))

def write(self, path: Path) -> None:
    """Write the torrent to disk"""
    encoded = self.bencode()
    with open(path, "wb") as f:
        f.write(encoded)

def v1_piece_range(self, piece_idx: int) -> V1PieceRange:
    """
    Get a v1 piece range from the piece index.

    Args:
        piece_idx: index into the info dict's ``pieces`` list.

    Returns:
        :class:`.V1PieceRange` describing the file(s) and byte ranges
        that make up the piece, along with its expected SHA1 hash.

    Raises:
        IndexError: if ``piece_idx`` is out of range for this torrent.
        AssertionError: if called on a v2-only torrent.
    """
    assert self.torrent_version in (
        TorrentVersion.v1,
        TorrentVersion.hybrid,
    ), "Cannot get v1 piece ranges for v2-only torrents"
    self.info = cast(InfoDictV1 | InfoDictHybrid, self.info)
    if piece_idx >= len(self.info.pieces):
        raise IndexError(
            f"Cannot get piece index {piece_idx} for torrent with "
            f"{len(self.info.pieces)} pieces"
        )

    # absolute byte range of the piece within the concatenated file stream
    start_range = piece_idx * self.info.piece_length
    end_range = (piece_idx + 1) * self.info.piece_length

    if self.info.files is None:
        self.info.length = cast(int, self.info.length)
        # single file torrent - the piece is a slice of the one file,
        # clamped at the end for the (usually shorter) final piece
        return V1PieceRange(
            piece_idx=piece_idx,
            piece_hash=self.info.pieces[piece_idx],
            ranges=[
                FileItemRange(
                    path=[self.info.name],
                    length=self.info.length,
                    range_start=start_range,
                    range_end=min(self.info.length, end_range),
                )
            ],
        )

    size_idx = 0  # running total of the lengths of files already passed
    file_idx = 0
    found_len = 0
    ranges = []
    # first, find file where range starts
    # could probably be combined with the second step,
    # but just getting this working before worrying about aesthetics
    for i, file in enumerate(self.info.files):
        if file.length + size_idx > start_range:
            # range starts in this file.
            # BUGFIX: offset within the file is the piece start minus the
            # total size of all preceding files. The previous
            # `start_range % size_idx` is only coincidentally equal to this
            # when start_range < 2 * size_idx, and gave wrong offsets when
            # a small file preceded a large one.
            file_range_start = start_range - size_idx
            file_range_end = min(file.length, file_range_start + self.info.piece_length)
            found_len += file_range_end - file_range_start
            ranges.append(
                FileItemRange(
                    path=file.path,
                    attr=file.attr,
                    length=file.length,
                    range_start=file_range_start,
                    range_end=file_range_end,
                )
            )

            # index additional files starting at the next file
            file_idx = i + 1
            break
        else:
            size_idx += file.length

    # then, iterate through files until the range or files are exhausted
    # (the final piece may end before a full piece_length is accumulated)
    while found_len < self.info.piece_length and file_idx < len(self.info.files):
        file = self.info.files[file_idx]
        file_range_start = 0
        file_range_end = min(file.length, self.info.piece_length - found_len)

        ranges.append(
            FileItemRange(
                path=file.path,
                attr=file.attr,
                length=file.length,
                range_start=file_range_start,
                range_end=file_range_end,
            )
        )
        found_len += file_range_end - file_range_start
        file_idx += 1
    return V1PieceRange(
        piece_idx=piece_idx, ranges=ranges, piece_hash=self.info.pieces[piece_idx]
    )

def v2_piece_range(self, file: str, piece_idx: int = 0) -> V2PieceRange:
    """
    Get a v2 piece range from a file path and optional piece index.

    When ``piece_idx`` is left at its default of 0:

    - If the file is larger than the piece length, gets the 0th piece.
    - If the file is smaller than the piece length,
      the range corresponds to the whole file, the hash is the root hash,
      and piece_idx is ignored.
    """
    assert self.torrent_version in (
        TorrentVersion.v2,
        TorrentVersion.hybrid,
    ), "Cannot get v2 piece ranges from a v1-only torrent"

    # narrow types to satisfy mypy
    self.info = cast(InfoDictV2 | InfoDictHybrid, self.info)
    tree_items = cast(dict[str, FileTreeItem], self.flat_files)
    self.piece_layers = cast(PieceLayersType, self.piece_layers)

    if file not in tree_items:
        raise ValueError(f"file {file} not found in torrent!")

    item = tree_items[file]
    root = item["pieces root"]
    file_length = item["length"]

    if root not in self.piece_layers:
        # file is smaller than piece_length: no piece layer entry exists,
        # so the piece range is the whole file verified by its root hash
        return V2PieceRange(
            piece_idx=0,
            path=file,
            range_start=0,
            range_end=file_length,
            piece_length=self.info.piece_length,
            file_size=file_length,
            root_hash=root,
        )

    layer = self.piece_layers[root]
    if piece_idx >= len(layer):
        raise IndexError(
            f"piece index {piece_idx} is out of range for file with "
            f"{len(layer)} pieces"
        )
    start = piece_idx * self.info.piece_length
    return V2PieceRange(
        piece_idx=piece_idx,
        path=file,
        range_start=start,
        range_end=min(file_length, start + self.info.piece_length),
        piece_length=self.info.piece_length,
        file_size=file_length,
        piece_hash=layer[piece_idx],
        root_hash=root,
    )

@model_validator(mode="after")
def piece_layers_if_v2(self) -> Self:
"""If we are a v2 or hybrid torrent, we should have piece layers"""
Expand All @@ -228,7 +381,7 @@ def pieces_layers_correct(self) -> Self:
"""
if self.torrent_version == TorrentVersion.v1:
return self
self.piece_layers = cast(dict[bytes, bytes], self.piece_layers)
self.piece_layers = cast(PieceLayersType, self.piece_layers)
self.info = cast(InfoDictV2 | InfoDictHybrid, self.info)
for path, file_info in self.info.flat_tree.items():
if file_info["length"] > self.info.piece_length:
Expand All @@ -237,28 +390,14 @@ def pieces_layers_correct(self) -> Self:
f"Expected to find: {file_info['pieces root']}" # type: ignore
)
expected_pieces = ceil(file_info["length"] / self.info.piece_length)
assert len(self.piece_layers[file_info["pieces root"]]) == expected_pieces * 32, (
assert len(self.piece_layers[file_info["pieces root"]]) == expected_pieces, (
f"File {path} does not have the correct number of piece hashes. "
f"Expected {expected_pieces} hashes from file length {file_info['length']} "
f"and piece length {self.info.piece_length}. "
f"Got {len(self.piece_layers[file_info['pieces root']]) / 32}"
f"Got {len(self.piece_layers[file_info['pieces root']])}"
)
return self

def bencode(self) -> bytes:
dumped = self.model_dump_torrent(mode="str")
return bencode_rs.bencode(dumped)

def write(self, path: Path) -> None:
"""Write the torrent to disk"""
with open(path, "wb") as f:
f.write(self.bencode())

@property
def file_size(self) -> int:
"""Size of the generated torrent file, in bytes"""
return len(self.bencode())


def pprint(t: TorrentBase, verbose: int = 0) -> None:
"""
Expand Down
21 changes: 20 additions & 1 deletion src/torrent_models/types/common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import sys
from abc import abstractmethod
from enum import StrEnum
from pathlib import Path
from typing import Annotated, NotRequired, TypeAlias

from annotated_types import Ge, Len
from pydantic import AfterValidator, AnyUrl, Field
from pydantic import AfterValidator, AnyUrl, BaseModel, Field

from torrent_models.base import ConfiguredBase
from torrent_models.types.serdes import ByteStr
Expand Down Expand Up @@ -81,3 +82,21 @@ class GenericFileItem(ConfiguredBase):
length: Annotated[int, Ge(0)]
attr: bytes | None = None
pieces_root: bytes | None = Field(None, alias="pieces root")


class PieceRange(BaseModel):
    """
    Parent model for v1 and v2 piece ranges.

    Piece ranges provide some description of paths and byte ranges that correspond to a single
    verifiable piece and a method for verifying data against them.

    Since v1 and v2 data models are substantially different,
    their sub-models are also quite different, but provide a common interface through this ABC
    """

    # index of the piece (for v1 subclasses: within the whole torrent;
    # for v2 subclasses: within a single file)
    piece_idx: int

    # NOTE(review): @abstractmethod on a pydantic BaseModel relies on pydantic's
    # metaclass honoring abstract methods - confirm instantiation is actually blocked
    @abstractmethod
    def validate_data(self, data: list[bytes]) -> bool:
        """Check that the provided data matches the piece or root hash"""
40 changes: 39 additions & 1 deletion src/torrent_models/types/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Types used only in v1 (and hybrid) torrents
"""

import hashlib
from typing import Annotated, Self

from annotated_types import Ge
Expand All @@ -15,7 +16,7 @@
from pydantic_core.core_schema import SerializationInfo

from torrent_models.base import ConfiguredBase
from torrent_models.types.common import FilePart, SHA1Hash, _power_of_two
from torrent_models.types.common import FilePart, PieceRange, SHA1Hash, _power_of_two

V1PieceLength = Annotated[int, AfterValidator(_power_of_two)]
"""
Expand Down Expand Up @@ -78,3 +79,40 @@ def strict_padfile_naming(self, info: ValidationInfo) -> Self:
str(self.length),
], "strict mode - padfiles must be named `.pad/{length}`"
return self


class FileItemRange(FileItem):
    """A File Item with a byte range, for use with V1PieceRange"""

    # byte offsets of the piece's slice *within this file* (not within the
    # whole torrent stream); range_end is exclusive, so the slice length is
    # range_end - range_start
    range_start: int
    range_end: int


class V1PieceRange(PieceRange):
    """
    Paths and byte ranges that correspond to a single v1 piece.
    """

    # one entry per file the piece spans, in stream order
    ranges: list[FileItemRange]
    piece_hash: SHA1Hash

    def validate_data(self, data: list[bytes]) -> bool:
        """
        Validate data against hash by concatenating bytes and comparing the SHA1 hash

        The user is responsible for providing all-zero bytestrings
        for any padding files in the indicated ranges

        Args:
            data: one chunk per entry in ``ranges``, each exactly
                ``range_end - range_start`` bytes long.

        Returns:
            ``True`` if the SHA1 of the concatenated chunks equals ``piece_hash``.
        """
        assert len(data) == len(
            self.ranges
        ), "Need to provide data chunks that correspond to each of the indicated file ranges"
        # `file_range` rather than `range` - don't shadow the builtin
        for file_range, chunk in zip(self.ranges, data):
            assert (file_range.range_end - file_range.range_start) == len(chunk), (
                "Provided data chunks must match the sizes indicated by the "
                "start and end ranges of each file range"
            )

        # hashlib.sha1() is the direct constructor; hashlib.new("sha1") is the
        # slower generic lookup for dynamically-named algorithms
        hasher = hashlib.sha1()
        for chunk in data:
            hasher.update(chunk)
        return self.piece_hash == hasher.digest()
Loading
Loading