Skip to content

Commit 5ec9891

Browse files
committed
Add repack_hdf5 utility and tests
Introduce repack_hdf5 in mth5.utils.h5_tools to copy HDF5 file-level attributes, groups, datasets, and their attributes into a new file (mirroring h5repack behavior). The function validates input existence, prevents accidental overwrite unless overwrite=True, and returns the output Path. Add tests to verify structure/data/attributes are copied, missing source raises FileNotFoundError, and overwrite behavior raises FileExistsError unless allowed. Uses h5py and pathlib.
1 parent b62b17b commit 5ec9891

2 files changed

Lines changed: 150 additions & 0 deletions

File tree

mth5/utils/h5_tools.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""Utilities for HDF5 maintenance tasks."""
2+
3+
from __future__ import annotations
4+
5+
from pathlib import Path
6+
from typing import Union
7+
8+
import h5py
9+
10+
11+
PathLike = Union[str, Path]
12+
13+
14+
def repack_hdf5(
    input_file: PathLike,
    output_file: PathLike,
    *,
    overwrite: bool = False,
) -> Path:
    """Repack an HDF5 file by copying all objects into a new file.

    Repacking can reduce file size when metadata or datasets were repeatedly
    modified and the source file contains unused internal space.

    This mirrors the core idea of ``h5repack``: rewrite all objects into a new
    HDF5 container so only live content remains. It does *not* expose the full
    ``h5repack`` feature set (for example: filter/layout transformation,
    chunk-size retuning, and advanced option flags).

    The copy is written to a staging file next to ``output_file`` and moved
    into place only after it completes, so a failed repack neither destroys an
    existing ``output_file`` nor leaves a partially written one behind.

    Parameters
    ----------
    input_file : str | pathlib.Path
        Source HDF5 file to repack.
    output_file : str | pathlib.Path
        Destination path for the repacked HDF5 file.
    overwrite : bool, default=False
        If ``True``, overwrite ``output_file`` when it exists.

    Returns
    -------
    pathlib.Path
        The output path.

    Raises
    ------
    FileNotFoundError
        If ``input_file`` does not exist.
    FileExistsError
        If ``output_file`` exists and ``overwrite`` is ``False``.
    ValueError
        If ``overwrite`` is ``True`` and ``output_file`` refers to the same
        file as ``input_file``; repacking in place would destroy the source.

    Examples
    --------
    >>> from mth5.utils.h5_tools import repack_hdf5
    >>> repacked = repack_hdf5("survey_original.h5", "survey_repacked.h5", overwrite=True)
    >>> print(repacked)
    survey_repacked.h5
    """
    source_path = Path(input_file)
    target_path = Path(output_file)

    if not source_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {source_path}")

    if target_path.exists():
        if not overwrite:
            raise FileExistsError(
                f"Output file already exists: {target_path}. "
                "Set overwrite=True to replace it."
            )
        # samefile also catches symlinks/hard links and differently spelled
        # paths that point at the source; replacing it would lose the input.
        if source_path.samefile(target_path):
            raise ValueError(
                f"Output file is the same file as the input: {target_path}. "
                "Choose a different output path."
            )

    # Stage in the destination directory so the final rename stays on one
    # filesystem and replaces any existing output in a single step.
    staging_path = target_path.with_name(f".{target_path.name}.repack_tmp")
    try:
        with h5py.File(source_path, "r") as source_h5, h5py.File(
            staging_path, "w"
        ) as target_h5:
            # Copy file-level attributes.
            for key, value in source_h5.attrs.items():
                target_h5.attrs[key] = value

            # Copy root objects recursively (groups, datasets, attributes).
            for name in source_h5.keys():
                source_h5.copy(name, target_h5, name=name)

        # Both files are closed here; publish the finished copy.
        staging_path.replace(target_path)
    except BaseException:
        # Do not leave a half-written staging file behind on any failure.
        if staging_path.exists():
            staging_path.unlink()
        raise

    return target_path

tests/utils/test_h5_tools.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import h5py
2+
import numpy as np
3+
import pytest
4+
5+
from mth5.utils.h5_tools import repack_hdf5
6+
7+
8+
def test_repack_hdf5_copies_structure_data_and_attributes(tmp_path):
    """A repacked file must hold the same groups, datasets, and attributes."""
    src_file = tmp_path / "source.h5"
    dst_file = tmp_path / "repacked.h5"
    expected_values = np.arange(10, dtype=np.float64)

    # Build a small source file carrying an attribute at every level:
    # file root, group, and dataset.
    with h5py.File(src_file, "w") as fid:
        fid.attrs["file_attr"] = "root"

        level1 = fid.create_group("level1")
        level1.attrs["group_attr"] = 7

        values = level1.create_dataset("values", data=expected_values)
        values.attrs["unit"] = "counts"

    result = repack_hdf5(src_file, dst_file)

    # The function reports the destination it wrote, and the file is on disk.
    assert result == dst_file
    assert dst_file.exists()

    # Re-open the repacked file and walk the same hierarchy.
    with h5py.File(dst_file, "r") as fid:
        assert fid.attrs["file_attr"] == "root"
        assert "level1" in fid

        level1 = fid["level1"]
        assert level1.attrs["group_attr"] == 7

        values = level1["values"]
        np.testing.assert_array_equal(values[()], expected_values)
        assert values.attrs["unit"] == "counts"
38+
39+
def test_repack_hdf5_raises_if_source_is_missing(tmp_path):
    """A non-existent input path must be rejected with FileNotFoundError."""
    # Neither file is ever created; only the input's absence matters here.
    destination = tmp_path / "output.h5"
    absent_input = tmp_path / "does_not_exist.h5"

    with pytest.raises(FileNotFoundError):
        repack_hdf5(absent_input, destination)
46+
47+
48+
def test_repack_hdf5_respects_overwrite_flag(tmp_path):
    """An existing output is only replaced when ``overwrite=True`` is passed."""
    src_file = tmp_path / "source.h5"
    dst_file = tmp_path / "output.h5"
    payload = np.array([1, 2, 3], dtype=np.int64)

    with h5py.File(src_file, "w") as fid:
        fid.create_dataset("x", data=payload)

    # Initial repack: the destination does not exist yet, so it is created.
    repack_hdf5(src_file, dst_file)

    # With the default overwrite=False, an existing destination is refused.
    with pytest.raises(FileExistsError):
        repack_hdf5(src_file, dst_file)

    # Explicit opt-in succeeds and still returns the destination path.
    result = repack_hdf5(src_file, dst_file, overwrite=True)
    assert result == dst_file

0 commit comments

Comments
 (0)