Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion olot/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def oci_layers_on_top(
*,
labels: typing.Union[dict[str, str], None] = None,
annotations: typing.Union[dict[str, str], None] = None,
root_dir: typing.Union[str, os.PathLike, None] = None,
remove_originals: typing.Union[RemoveOriginals, None] = None,
add_modelpack: typing.Union[bool, None] = None):
"""
Expand All @@ -49,6 +50,7 @@ def oci_layers_on_top(
modelcard: PathLike of the README.md of the ModelCard, will be added as the last layer with compression and annotations. If indicated, it shouldn't be part of model_files.
labels: labels to be added to the OCI Image Config.
annotations: annotations to be added to the OCI Image Manifest.
root_dir: root directory of the model files. When provided, the relative path of each file from root_dir is used to preserve subdirectory structure in layer arcnames (e.g. a file at root_dir/onnx/model.onnx gets arcname /models/onnx/model.onnx). When None (default), existing flat behavior is preserved.
remove_originals: whether to remove the original content files after having added the layers, default: None.
add_modelpack: whether to add a ModelPack manifest to the multi-arch oci-layout only if not already present, default: None.
"""
Expand All @@ -68,6 +70,14 @@ def oci_layers_on_top(
if model.is_dir():
logger.warning(f"One of the input is a whole directory and will result in non-efficient layer-ing: {model}")

root_dir_resolved = Path(root_dir).resolve() if root_dir is not None else None
if root_dir_resolved is not None:
for model in model_files:
try:
Path(model).relative_to(root_dir_resolved)
except ValueError:
raise ValueError(f"model file '{model}' is not under root_dir '{root_dir_resolved}'") from None

verify_ocilayout(ocilayout)
if check_if_oci_layout_contains_docker_manifests(ocilayout):
logger.warning("OCI layout contains Docker distribution manifests, converting them to OCI format")
Expand All @@ -88,7 +98,16 @@ def oci_layers_on_top(
sha256_path = ocilayout / "blobs" / "sha256"
for model in model_files:
model = Path(model)
new_layer = tarball_from_file(model, sha256_path)
if root_dir_resolved is not None:
rel = model.relative_to(root_dir_resolved)
parent = str(rel.parent)
if parent == ".":
file_prefix = "/models/"
else:
file_prefix = "/models/" + parent + "/"
new_layer = tarball_from_file(model, sha256_path, prefix=file_prefix)
else:
new_layer = tarball_from_file(model, sha256_path)
new_layers[new_layer.layer_digest] = new_layer
if remove_originals:
handle_remove(model)
Expand Down
105 changes: 105 additions & 0 deletions tests/basic_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import tarfile
from pathlib import Path
from olot.utils.files import get_file_hash
import pytest
Expand Down Expand Up @@ -481,3 +482,107 @@ def test_add_labels_and_annotations(tmp_path: Path):
assert manifest0.layers[-1].annotations[ANNOTATION_LAYER_CONTENT_INLAYERPATH] == "/models/README.md"
assert manifest0.layers[-1].annotations[ANNOTATION_LAYER_CONTENT_DIGEST] == "sha256:"+checksum_from_disk2
assert manifest0.layers[-1].annotations[ANNOTATION_LAYER_CONTENT_NAME] == "README.md"



@pytest.mark.parametrize("use_root_dir", [True, False], ids=["with_root_dir", "without_root_dir"])
def test_oci_layers_on_top_nested_files(tmp_path: Path, use_root_dir):
    """Check the in-layer paths given to model files that live in nested directories.

    When root_dir is passed, every file is expected to keep its subdirectory
    relative to root_dir (e.g. models/quantized/int8/model.onnx).

    When root_dir is omitted, every file is expected to land at
    models/<filename>, so same-named files coming from different
    subdirectories end up sharing one archive path.
    """
    target_ocilayout = tmp_path / "myocilayout"
    shutil.copytree(get_test_data_path() / "ocilayout5", target_ocilayout)

    # Model tree that deliberately repeats file names at several depths.
    model_dir = tmp_path / "my-model"
    onnx_dir = model_dir / "onnx"
    int8_dir = model_dir / "quantized" / "int8"
    model_dir.mkdir()
    onnx_dir.mkdir()
    int8_dir.mkdir(parents=True)
    (model_dir / "config.json").write_text('{"top": true}')
    (onnx_dir / "config.json").write_text('{"onnx": true}')
    for directory, size in ((model_dir, 64), (onnx_dir, 128), (int8_dir, 96)):
        (directory / "model.onnx").write_bytes(os.urandom(size))

    # Regular files only, in a stable (string-sorted) order.
    models = [p for p in sorted(model_dir.rglob("*"), key=str) if p.is_file()]

    extra_kwargs = {"root_dir": model_dir} if use_root_dir else {}
    oci_layers_on_top(target_ocilayout, models, **extra_kwargs)

    # Locate the first manifest of the resulting layout and its added layers.
    ocilayout_root_index = read_ocilayout_root_index(target_ocilayout)
    ocilayout_indexes: Dict[str, OCIImageIndex] = crawl_ocilayout_indexes(target_ocilayout, ocilayout_root_index)
    ocilayout_manifests: Dict[str, OCIImageManifest] = crawl_ocilayout_manifests(target_ocilayout, ocilayout_indexes, ocilayout_root_index)
    manifest0: OCIImageManifest = next(iter(ocilayout_manifests.values()))
    new_layers = manifest0.layers[1:]  # the first layer is the original base layer

    # Paths as declared in the layer annotations; every new layer must have one.
    in_layer_paths = [
        layer.annotations[ANNOTATION_LAYER_CONTENT_INLAYERPATH]
        for layer in new_layers
        if layer.annotations is not None
    ]
    in_layer_paths.sort()
    assert len(in_layer_paths) == len(new_layers)

    # Paths as actually stored inside each layer tarball (directories ignored).
    blobs_dir = target_ocilayout / "blobs" / "sha256"
    all_archive_paths: list[str] = []
    for layer in new_layers:
        blob = blobs_dir / layer.digest.removeprefix("sha256:")
        with tarfile.open(str(blob), "r") as tar:
            for member in tar.getmembers():
                if not member.isdir():
                    all_archive_paths.append(member.name)

    if use_root_dir:
        expected = [
            "models/config.json",
            "models/model.onnx",
            "models/onnx/config.json",
            "models/onnx/model.onnx",
            "models/quantized/int8/model.onnx",
        ]
    else:
        # Flattened layout: one layer per input file, so duplicate file names
        # show up as repeated archive paths.
        expected = ["models/config.json"] * 2 + ["models/model.onnx"] * 3

    assert in_layer_paths == ["/" + p for p in expected]
    assert sorted(all_archive_paths) == expected


def test_oci_layers_on_top_root_dir_validation(tmp_path: Path):
    """Check that a model file located outside root_dir is rejected with ValueError."""
    target_ocilayout = tmp_path / "myocilayout"
    shutil.copytree(get_test_data_path() / "ocilayout5", target_ocilayout)

    # One file inside the directory later passed as root_dir ...
    model_dir = tmp_path / "my-model"
    model_dir.mkdir()
    inside_file = model_dir / "model.onnx"
    inside_file.write_bytes(os.urandom(64))

    # ... and one file in a sibling directory, i.e. not under model_dir.
    other_dir = tmp_path / "other"
    other_dir.mkdir()
    stray_file = other_dir / "stray.bin"
    stray_file.write_bytes(os.urandom(64))

    models = [inside_file, stray_file]

    with pytest.raises(ValueError, match="is not under root_dir"):
        oci_layers_on_top(target_ocilayout, models, root_dir=model_dir)
Loading