Skip to content

Commit c8b2db6

Browse files
Merge pull request #6 from p2p-ld/byte-ranges
Add: get byte ranges from piece indexes
2 parents 4a63d2a + 70f4914 commit c8b2db6

File tree

9 files changed

+424
-40
lines changed

9 files changed

+424
-40
lines changed

docs/changelog.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,44 @@
11
# Changelog
22

3+
## v0.3.*
4+
5+
### v0.3.1 - 2025-07-28
6+
7+
[#6](https://github.com/p2p-ld/torrent-models/pull/6)
8+
Add ability to get v1 and v2 byte ranges to validate partial data against.
9+
10+
The behavior differs somewhat significantly between v1 and v2, so we made separate implementations for both
11+
12+
- v1: {meth}`.Torrent.v1_piece_range` - a piece may correspond to a range within a single file or span several files, and may include padfiles that shouldn't really "exist" on a filesystem.
13+
- v2: {meth}`.Torrent.v2_piece_range` - much simpler: a file either has a single root hash or a set of hashes from a lower level of the merkle tree, and both are computed identically. Pieces are always either a whole file or a part of a single file.
14+
15+
These correspond to the models returned, which both have a {meth}`~torrent_models.types.common.PieceRange.validate_data` method:
16+
17+
- v1: {class}`~torrent_models.types.v1.V1PieceRange`
18+
- v2: {class}`~torrent_models.types.v2.V2PieceRange`
19+
20+
So we have two methods to get v1 and v2 ranges, which return a PieceRange object that can validate data passed to `validate_data`
21+
22+
So, e.g., if we have a v1 torrent of five 10 KiB files of all zeros and a piece size of 32 KiB, we might do something like this:
23+
24+
```python
25+
piece_range = torrent.v1_piece_range(0)
26+
piece_range.validate_data([bytes(10 * KiB), bytes(10 * KiB), bytes(10 * KiB), bytes(2 * KiB)])
27+
```
28+
29+
v2 torrents work at the block level, as they usually do, so if we had a single-file v2 torrent containing a 64 KiB file of all zeros with a piece size of 64 KiB, we would do:
30+
31+
```python
32+
piece_range = torrent.v2_piece_range('filename')
33+
piece_range.validate_data([bytes(16 * KiB) for _ in range(4)])
34+
```
35+
36+
#### Breaking
37+
38+
- changed the behavior of v2 piece layers dict to match v1 pieces:
39+
when in memory, we split up the pieces into a list of hashes, rather than one bigass bytestring,
40+
and then split again on serialization.
41+
342
## v0.2.*
443

544
### v0.2.1 - 2025-07-27

src/torrent_models/torrent.py

Lines changed: 157 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
UnixDatetime,
3333
str_keys,
3434
)
35-
from torrent_models.types.v2 import FileTree
35+
from torrent_models.types.v1 import FileItemRange, V1PieceRange
36+
from torrent_models.types.v2 import FileTree, V2PieceRange
3637

3738

3839
class TorrentBase(ConfiguredBase):
@@ -213,6 +214,158 @@ class Torrent(TorrentBase):
213214
A valid torrent file, including hashes.
214215
"""
215216

217+
@property
def file_size(self) -> int:
    """Number of bytes in the bencoded form of this torrent"""
    encoded = self.bencode()
    return len(encoded)
221+
222+
def bencode(self) -> bytes:
    """Dump the model with ``model_dump_torrent(mode="str")`` and bencode the result"""
    return bencode_rs.bencode(self.model_dump_torrent(mode="str"))
225+
226+
def write(self, path: Path) -> None:
    """Write the bencoded torrent to ``path``, replacing any existing file"""
    with open(path, mode="wb") as out:
        payload = self.bencode()
        out.write(payload)
230+
231+
def v1_piece_range(self, piece_idx: int) -> V1PieceRange:
    """
    Get the file byte ranges covered by a v1 piece.

    v1 pieces are cut from the concatenation of all files in ``info.files`` order,
    so a piece may cover part of a single file or span several files
    (including any padfiles listed in ``info.files``).

    Args:
        piece_idx: zero-based index into ``info.pieces``.

    Returns:
        A ``V1PieceRange`` holding the piece hash and one ``FileItemRange`` per file
        the piece touches. ``range_start`` and ``range_end`` are byte offsets relative
        to the start of each file, with ``range_end`` exclusive.

    Raises:
        AssertionError: if this is a v2-only torrent.
        IndexError: if ``piece_idx`` is negative or not smaller than the number of pieces.
    """
    assert self.torrent_version in (
        TorrentVersion.v1,
        TorrentVersion.hybrid,
    ), "Cannot get v1 piece ranges for v2-only torrents"
    # satisfy mypy...
    self.info = cast(InfoDictV1 | InfoDictHybrid, self.info)
    # negative indexes would silently yield negative byte offsets, so reject them too
    if not 0 <= piece_idx < len(self.info.pieces):
        raise IndexError(
            f"Cannot get piece index {piece_idx} for torrent with "
            f"{len(self.info.pieces)} pieces"
        )

    piece_length = self.info.piece_length
    piece_hash = self.info.pieces[piece_idx]
    # absolute offset of the piece within the concatenated file data
    start_range = piece_idx * piece_length

    if self.info.files is None:
        # single file torrent
        self.info.length = cast(int, self.info.length)
        return V1PieceRange(
            piece_idx=piece_idx,
            piece_hash=piece_hash,
            ranges=[
                FileItemRange(
                    path=[self.info.name],
                    length=self.info.length,
                    range_start=start_range,
                    # the last piece may be shorter than piece_length
                    range_end=min(self.info.length, start_range + piece_length),
                )
            ],
        )

    ranges: list[FileItemRange] = []
    file_offset = 0  # absolute offset at which the *next* file starts
    remaining = piece_length  # bytes of the piece not yet assigned to a file
    for file in self.info.files:
        file_start, file_offset = file_offset, file_offset + file.length
        if not ranges and file_offset <= start_range:
            # file ends before the piece begins
            continue
        if remaining <= 0:
            break

        # Only the first file can start mid-file. The offset within it is the
        # piece start minus the bytes of all preceding files
        # (a subtraction, not a modulo, which breaks when
        # start_range >= 2 * file_start).
        range_start = 0 if ranges else start_range - file_start
        range_end = min(file.length, range_start + remaining)
        ranges.append(
            FileItemRange(
                path=file.path,
                attr=file.attr,
                length=file.length,
                range_start=range_start,
                range_end=range_end,
            )
        )
        remaining -= range_end - range_start

    return V1PieceRange(piece_idx=piece_idx, ranges=ranges, piece_hash=piece_hash)
313+
314+
def v2_piece_range(self, file: str, piece_idx: int = 0) -> V2PieceRange:
    """
    Get a v2 piece range for a file path and an optional piece index.

    - If the file's pieces root has an entry in the piece layers,
      the range is the ``piece_idx``-th piece of that file (the 0th by default)
      and carries both the piece hash and the root hash.
    - Otherwise (the file is no larger than the piece length)
      the range is the whole file, only the root hash is set,
      and ``piece_idx`` is ignored.

    Raises:
        AssertionError: if this is a v1-only torrent.
        ValueError: if ``file`` is not a path in the torrent.
        IndexError: if ``piece_idx`` is beyond the file's piece layer.
    """
    assert self.torrent_version in (
        TorrentVersion.v2,
        TorrentVersion.hybrid,
    ), "Cannot get v2 piece ranges from a v1-only torrent"

    # the casts only narrow types for mypy
    self.info = cast(InfoDictV2 | InfoDictHybrid, self.info)
    files_by_path = cast(dict[str, FileTreeItem], self.flat_files)
    self.piece_layers = cast(PieceLayersType, self.piece_layers)

    if file not in files_by_path:
        raise ValueError(f"file {file} not found in torrent!")

    entry = files_by_path[file]
    root = entry["pieces root"]

    if root in self.piece_layers:
        layer = self.piece_layers[root]
        if piece_idx >= len(layer):
            raise IndexError(
                f"piece index {piece_idx} is out of range for file with "
                f"{len(layer)} pieces"
            )
        size = entry["length"]
        piece_length = self.info.piece_length
        offset = piece_idx * piece_length
        return V2PieceRange(
            piece_idx=piece_idx,
            path=file,
            range_start=offset,
            # the final piece of a file may be shorter than piece_length
            range_end=min(size, offset + piece_length),
            piece_length=piece_length,
            file_size=size,
            piece_hash=layer[piece_idx],
            root_hash=root,
        )

    # no piece layer: smaller than piece_length, piece range is the whole file
    size = entry["length"]
    return V2PieceRange(
        piece_idx=0,
        path=file,
        range_start=0,
        range_end=size,
        piece_length=self.info.piece_length,
        file_size=size,
        root_hash=root,
    )
368+
216369
@model_validator(mode="after")
217370
def piece_layers_if_v2(self) -> Self:
218371
"""If we are a v2 or hybrid torrent, we should have piece layers"""
@@ -228,7 +381,7 @@ def pieces_layers_correct(self) -> Self:
228381
"""
229382
if self.torrent_version == TorrentVersion.v1:
230383
return self
231-
self.piece_layers = cast(dict[bytes, bytes], self.piece_layers)
384+
self.piece_layers = cast(PieceLayersType, self.piece_layers)
232385
self.info = cast(InfoDictV2 | InfoDictHybrid, self.info)
233386
for path, file_info in self.info.flat_tree.items():
234387
if file_info["length"] > self.info.piece_length:
@@ -237,28 +390,14 @@ def pieces_layers_correct(self) -> Self:
237390
f"Expected to find: {file_info['pieces root']}" # type: ignore
238391
)
239392
expected_pieces = ceil(file_info["length"] / self.info.piece_length)
240-
assert len(self.piece_layers[file_info["pieces root"]]) == expected_pieces * 32, (
393+
assert len(self.piece_layers[file_info["pieces root"]]) == expected_pieces, (
241394
f"File {path} does not have the correct number of piece hashes. "
242395
f"Expected {expected_pieces} hashes from file length {file_info['length']} "
243396
f"and piece length {self.info.piece_length}. "
244-
f"Got {len(self.piece_layers[file_info['pieces root']]) / 32}"
397+
f"Got {len(self.piece_layers[file_info['pieces root']])}"
245398
)
246399
return self
247400

248-
def bencode(self) -> bytes:
249-
dumped = self.model_dump_torrent(mode="str")
250-
return bencode_rs.bencode(dumped)
251-
252-
def write(self, path: Path) -> None:
253-
"""Write the torrent to disk"""
254-
with open(path, "wb") as f:
255-
f.write(self.bencode())
256-
257-
@property
258-
def file_size(self) -> int:
259-
"""Size of the generated torrent file, in bytes"""
260-
return len(self.bencode())
261-
262401

263402
def pprint(t: TorrentBase, verbose: int = 0) -> None:
264403
"""

src/torrent_models/types/common.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import sys
2+
from abc import abstractmethod
23
from enum import StrEnum
34
from pathlib import Path
45
from typing import Annotated, NotRequired, TypeAlias
56

67
from annotated_types import Ge, Len
7-
from pydantic import AfterValidator, AnyUrl, Field
8+
from pydantic import AfterValidator, AnyUrl, BaseModel, Field
89

910
from torrent_models.base import ConfiguredBase
1011
from torrent_models.types.serdes import ByteStr
@@ -81,3 +82,21 @@ class GenericFileItem(ConfiguredBase):
8182
length: Annotated[int, Ge(0)]
8283
attr: bytes | None = None
8384
pieces_root: bytes | None = Field(None, alias="pieces root")
85+
86+
87+
class PieceRange(BaseModel):
    """
    Common base model for v1 and v2 piece ranges.

    A piece range describes the path(s) and byte range(s) that make up one
    verifiable piece, and exposes a method to verify data against it.

    The v1 and v2 data models differ substantially, so the concrete sub-models
    look quite different; this class only fixes the interface they share.
    """

    # index of the piece this range describes
    piece_idx: int

    @abstractmethod
    def validate_data(self, data: list[bytes]) -> bool:
        """Return whether the provided data matches the piece or root hash"""

src/torrent_models/types/v1.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Types used only in v1 (and hybrid) torrents
33
"""
44

5+
import hashlib
56
from typing import Annotated, Self
67

78
from annotated_types import Ge
@@ -15,7 +16,7 @@
1516
from pydantic_core.core_schema import SerializationInfo
1617

1718
from torrent_models.base import ConfiguredBase
18-
from torrent_models.types.common import FilePart, SHA1Hash, _power_of_two
19+
from torrent_models.types.common import FilePart, PieceRange, SHA1Hash, _power_of_two
1920

2021
V1PieceLength = Annotated[int, AfterValidator(_power_of_two)]
2122
"""
@@ -78,3 +79,40 @@ def strict_padfile_naming(self, info: ValidationInfo) -> Self:
7879
str(self.length),
7980
], "strict mode - padfiles must be named `.pad/{length}`"
8081
return self
82+
83+
84+
class FileItemRange(FileItem):
    """A file item plus the byte range of that file covered by a piece (see V1PieceRange)"""

    # byte offsets within the file; V1PieceRange.validate_data expects
    # range_end - range_start bytes of data for this item
    range_start: int
    range_end: int
89+
90+
91+
class V1PieceRange(PieceRange):
    """
    Paths and byte ranges that correspond to a single v1 piece, plus its SHA1 hash
    """

    ranges: list[FileItemRange]
    piece_hash: SHA1Hash

    def validate_data(self, data: list[bytes]) -> bool:
        """
        Check data against the piece hash.

        ``data`` must contain one chunk per entry in ``ranges``, in the same order,
        each exactly ``range_end - range_start`` bytes long. The chunks are hashed
        in sequence with SHA1 and the digest is compared to ``piece_hash``.

        The caller must supply all-zero bytestrings
        for any padding files in the indicated ranges.
        """
        n_chunks, n_ranges = len(data), len(self.ranges)
        assert (
            n_chunks == n_ranges
        ), "Need to provide data chunks that correspond to each of the indicated file ranges"
        for file_range, chunk in zip(self.ranges, data):
            expected_size = file_range.range_end - file_range.range_start
            assert expected_size == len(chunk), (
                "Provided data chunks must match the sizes indicated by the "
                "start and end ranges of each file range"
            )

        sha1 = hashlib.sha1()
        for chunk in data:
            sha1.update(chunk)
        return self.piece_hash == sha1.digest()

0 commit comments

Comments
 (0)