diff --git a/clients/python/poetry.lock b/clients/python/poetry.lock index 151b1cd694..d8246fb77d 100644 --- a/clients/python/poetry.lock +++ b/clients/python/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.3 and should not be changed by hand. [[package]] name = "accessible-pygments" @@ -538,6 +538,7 @@ files = [ {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] +markers = {main = "extra == \"hf\" or extra == \"signing\""} [[package]] name = "cffi" @@ -736,6 +737,7 @@ files = [ {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +markers = {main = "extra == \"signing\""} [[package]] name = "click" @@ -1198,7 +1200,7 @@ accessible-pygments = ">=0.0.5" beautifulsoup4 = "*" pygments = ">=2.7" sphinx = ">=7.0,<10.0" -sphinx-basic-ng = ">=1.0.0.beta2" +sphinx-basic-ng = ">=1.0.0b2" [[package]] name = "graphql-core" @@ -1580,7 +1582,7 @@ markers = {main = "extra == \"signing\""} [package.dependencies] attrs = ">=22.2.0" -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rpds-py = ">=0.7.1" @@ -2189,15 +2191,15 @@ files = [ [[package]] name = "olot" -version = "0.1.16" +version = "0.1.17" description = "oci layers on top" optional = true python-versions = "<4.0,>=3.9" groups = ["main"] markers = "extra == \"olot\"" files = [ - {file = "olot-0.1.16-py3-none-any.whl", hash = "sha256:5baaaeb8ee4a688cb096e8149467fabc730bf5ed15fab1051fa90b09132eb0db"}, - {file = "olot-0.1.16.tar.gz", hash = "sha256:c040065744ede4d0f128dc362fad9115c6664a9bbc97e8859eab399a2bd52144"}, + {file = "olot-0.1.17-py3-none-any.whl", hash = "sha256:32e5251a447f82a1f0a854c5c6642ccfaa99e5a431251e21340f13508fbfb743"}, + {file = "olot-0.1.17.tar.gz", hash = "sha256:0e55538df2fa4cc386a71338ede415266e60fcc2a19c42bfcf435ba38cf786c4"}, ] [package.dependencies] @@ -2970,6 +2972,7 @@ files = [ {file = "requests-2.33.0-py3-none-any.whl", hash = "sha256:3324635456fa185245e24865e810cecec7b4caf933d7eb133dcde67d48cee69b"}, {file = "requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652"}, ] +markers = {main = "extra == \"signing\""} [package.dependencies] certifi = ">=2023.5.7" @@ -3279,10 +3282,10 @@ files = [ ] [package.dependencies] -botocore = ">=1.37.4,<2.0a.0" +botocore = ">=1.37.4,<2.0a0" [package.extras] -crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"] +crt = ["botocore[crt] (>=1.37.4,<2.0a0)"] [[package]] name = "schemathesis" @@ -3877,6 +3880,7 @@ files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] +markers = {docs = "python_version < \"3.13\""} [[package]] name = "typing-inspection" @@ -3919,6 +3923,7 @@ files = [ {file = "urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4"}, {file = "urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed"}, ] +markers = {main = "extra == \"boto3\" or extra == \"signing\""} [package.extras] brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""] @@ -4304,4 +4309,4 @@ signing = ["platformdirs", "rh-model-signing"] [metadata] lock-version = "2.1" python-versions = ">= 3.10, < 4.0" -content-hash = "03c802aa17990420317ccca07a4dd2c52aaf6c1423c356191abf06feedd1411a" +content-hash = "5e054c01de10e8dda4e3c3d6ee0f04a3f702784ca59544d981d47eb344b9f8ed" diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 69e62e9df2..cb1ba0096d 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -25,7 +25,7 @@ aiohttp-retry = "^2.8.3" nest-asyncio2 = "^1.7.1" huggingface-hub = { version = ">=0.20.1,<1.8.0", optional = true } -olot = { version = "^0.1.6", optional = true } +olot = { version = "^0.1.17", optional = true } boto3 = { version = "^1.37.34", optional = true } rh-model-signing = { version = "0.1.0", optional = true } platformdirs = { version = "^4.0.0", optional = true } diff --git a/clients/python/src/model_registry/utils.py b/clients/python/src/model_registry/utils.py index 4ce1daf4d3..cb54e8dcd8 100644 --- a/clients/python/src/model_registry/utils.py +++ b/clients/python/src/model_registry/utils.py @@ -393,12 +393,9 @@ def save_to_oci_registry( # noqa: C901 ( complex args >8 ) if auth_file is not None: params["authfile"] = auth_file.name backend_def.pull(base_image, local_image_path, **params) - # Pass top-level entries (dir/*) instead of individual leaf files (dir/**/*). - # olot's tarball_from_file preserves subdirectory structure when given a - # directory, but flattens everything to basename when given individual files. - model_path = Path(model_files_path) - files = [model_path] if model_path.is_file() else sorted(model_path.iterdir()) - oci_layers_on_top(local_image_path, files, modelcard) + # Extract the absolute path from the files found in the path + files = [file[0] for file in _get_files_from_path(model_files_path)] # type: ignore[arg-type] + oci_layers_on_top(local_image_path, files, modelcard, root_dir=model_files_path) backend_def.push(local_image_path, oci_ref, **params) # Return the OCI URI diff --git a/clients/python/tests/conftest.py b/clients/python/tests/conftest.py index 59d49d9f47..8bc154e4d9 100644 --- a/clients/python/tests/conftest.py +++ b/clients/python/tests/conftest.py @@ -281,6 +281,52 @@ def get_temp_dir_with_nested_models(): os.rmdir(temp_dir) +@pytest.fixture +def get_temp_dir_with_deeply_nested_models(): + """Model directory with multiple levels of nesting to verify structure preservation. + + Creates: + my-model/ + ├── README.md + ├── onnx/ + │ ├── model.onnx + │ └── weights/ + │ └── quantized.bin + └── tokenizer/ + ├── config.json + └── vocab.txt + """ + temp_dir = tempfile.mkdtemp() + model_dir = os.path.join(temp_dir, "my-model") + os.makedirs(model_dir) + + dirs = [ + os.path.join(model_dir, "onnx"), + os.path.join(model_dir, "onnx", "weights"), + os.path.join(model_dir, "tokenizer"), + ] + for d in dirs: + os.makedirs(d) + + file_entries = [ + ("README.md", b"# My Model"), + ("onnx/model.onnx", b"fake-onnx-data"), + ("onnx/weights/quantized.bin", b"fake-weights-data"), + ("tokenizer/config.json", b'{"key": "value"}'), + ("tokenizer/vocab.txt", b"hello\nworld"), + ] + file_paths = [] + for rel_path, content in file_entries: + full_path = os.path.join(model_dir, rel_path) + with open(full_path, "wb") as f: + f.write(content) + file_paths.append(full_path) + + yield model_dir, file_paths + + shutil.rmtree(temp_dir) + + @pytest.fixture def get_large_model_dir(): """Creates a directory containing a large model file (300-500MB) for testing file size extremes.""" diff --git a/clients/python/tests/test_utils.py b/clients/python/tests/test_utils.py index c741e6ee59..2e782adabb 100644 --- a/clients/python/tests/test_utils.py +++ b/clients/python/tests/test_utils.py @@ -1,5 +1,6 @@ import json import os +import tarfile from contextlib import contextmanager from pathlib import Path @@ -83,16 +84,24 @@ def test_s3_uri_builder_with_complete_env(): @pytest.mark.e2e(type="oci") -def test_save_to_oci_registry_with_skopeo(get_temp_dir_with_models, get_temp_dir): +def test_save_to_oci_registry_with_skopeo(get_temp_dir_with_deeply_nested_models, get_temp_dir): + """Verify OCI layers preserve directory structure with one layer per file. + + Uses a real skopeo backend and local OCI registry. After save_to_oci_registry + runs, inspects the local OCI layout to confirm: + - Each model file is in its own layer (one layer per file) + - Subdirectory paths are preserved in tar arcnames (not flattened) + """ base_image = "quay.io/mmortari/hello-world-wait:latest" - dest_dir, _ = get_temp_dir_with_models + model_dir, model_files = get_temp_dir_with_deeply_nested_models oci_ref = "localhost:5001/foo/bar:latest" + oci_dest = get_temp_dir save_to_oci_registry( base_image=base_image, oci_ref=oci_ref, - model_files_path=dest_dir, - dest_dir=get_temp_dir, + model_files_path=model_dir, + dest_dir=oci_dest, custom_oci_backend=utils._get_skopeo_backend( push_args=[ "--dest-tls-verify=false", @@ -102,6 +111,58 @@ def test_save_to_oci_registry_with_skopeo(get_temp_dir_with_models, get_temp_dir ), ) + # Inspect the OCI layout to verify layer structure. + # Walk from index.json to an image manifest with layers, handling both + # single-arch images (index -> manifest) and multi-arch images + # (index -> image index -> manifest). + oci_path = Path(oci_dest) + blobs = oci_path / "blobs" / "sha256" + index = json.loads((oci_path / "index.json").read_text()) + digest = index["manifests"][0]["digest"].replace("sha256:", "") + manifest = json.loads((blobs / digest).read_text()) + if "layers" not in manifest: + # Multi-arch: manifest is an image index, follow to first platform + digest = manifest["manifests"][0]["digest"].replace("sha256:", "") + manifest = json.loads((blobs / digest).read_text()) + + # Identify model layers added by olot (they have olot annotations). + # Base image layers don't have these annotations. + model_layers = [ + layer for layer in manifest["layers"] + if "olot.layer.content.inlayerpath" in layer.get("annotations", {}) + ] + + # Each model file should be in its own layer (one layer per file) + assert len(model_layers) == len(model_files), ( + f"Expected {len(model_files)} model layers (one per file), got {len(model_layers)}" + ) + + # Verify directory structure is preserved by checking the in-layer paths + in_layer_paths = sorted( + layer["annotations"]["olot.layer.content.inlayerpath"] + for layer in model_layers + ) + expected_paths = sorted( + "/models/" + os.path.relpath(f, model_dir) for f in model_files + ) + assert in_layer_paths == expected_paths, ( + f"Directory structure not preserved.\n" + f"Expected: {expected_paths}\n" + f"Found: {in_layer_paths}" + ) + + # Also verify the actual tar contents match the annotations + for layer in model_layers: + digest = layer["digest"].replace("sha256:", "") + blob_path = blobs / digest + with tarfile.open(blob_path, "r:") as tar: + file_entries = [m.name for m in tar.getmembers() if m.isfile()] + assert len(file_entries) == 1, ( + f"Expected one file per layer tar, got {file_entries}" + ) + expected_tar_path = layer["annotations"]["olot.layer.content.inlayerpath"].lstrip("/") + assert file_entries[0] == expected_tar_path + def test_save_to_oci_registry_with_custom_backend( get_temp_dir_with_models, get_temp_dir, get_mock_custom_oci_backend @@ -167,9 +228,12 @@ def temp_auth_file_wrapper(auth): def test_save_to_oci_registry_preserves_dir_structure(mocker, tmp_path): - """Verify that subdirectories are passed as top-level entries, not flattened leaf files. + """Verify one layer per file with correct directory structure via root_dir. Regression test for https://github.com/kubeflow/model-registry/issues/2437 + With root_dir support we expect: + - One layer per individual file (not one layer per top-level subdir) + - Original directory structure preserved (paths relative to root_dir) """ model_files_path = tmp_path / "my-model" model_files_path.mkdir() @@ -195,11 +259,21 @@ def test_save_to_oci_registry_preserves_dir_structure(mocker, tmp_path): backend="skopeo", ) - # oci_layers_on_top should receive top-level entries (directories + files), - # NOT recursively flattened individual files. + # oci_layers_on_top should receive individual files (one layer per file), + # NOT top-level directories (which would bundle multiple files per layer). called_files = mock_layers.call_args.args[1] - called_names = sorted(p.name for p in called_files) - assert called_names == ["README.md", "onnx", "tokenizer"] + called_rel_paths = sorted( + str(Path(f).relative_to(model_files_path)) for f in called_files + ) + assert called_rel_paths == [ + "README.md", + "onnx/model.onnx", + "onnx/weights/quantized.bin", + "tokenizer/vocab.txt", + ] + + # root_dir must be passed so olot preserves the directory structure in layers + assert mock_layers.call_args.kwargs["root_dir"] == model_files_path @pytest.mark.e2e(type="oci")