diff --git a/docs/demo.ipynb b/docs/demo.ipynb index 9167d14f..050f97b4 100644 --- a/docs/demo.ipynb +++ b/docs/demo.ipynb @@ -1093,7 +1093,7 @@ "outputs": [], "source": [ "hashing_config = model_signing.hashing.Config().set_ignored_paths(\n", - " paths=[\"README.md\"], ignore_git_paths=True\n", + " paths=[\"README.md\"], ignore_git_paths=True, ignore_att_paths=True\n", ")" ] }, diff --git a/docs/model_signing_format.md b/docs/model_signing_format.md index f0da38c0..0772fc43 100644 --- a/docs/model_signing_format.md +++ b/docs/model_signing_format.md @@ -66,6 +66,10 @@ library after passing it an In-Toto Statement. This API will sign the statement, producing a DSSE envelope, along with a DSSE log entry that is submitted to the transparency log. +## Excluded Files + +Signature and attestation files are excluded from the model signature by default so that attestations (SLSA provenance, SBOMs, etc.) can accumulate independently throughout a model's lifecycle without invalidating the original signature. The excluded patterns are `*.sig`, `*.sigstore.json`, and `claims.jsonl`, matched against the top level of the model directory. To include these files in the signature, pass `ignore_att_paths=False` to `set_ignored_paths` or use the `--no-ignore-att-paths` CLI flag. + ## Example Format Below is an example of the Sigstore bundle showing each of the layers described above. diff --git a/src/model_signing/__init__.py b/src/model_signing/__init__.py index 6d45803e..23391e85 100644 --- a/src/model_signing/__init__.py +++ b/src/model_signing/__init__.py @@ -53,14 +53,14 @@ private_key="key" ).set_hashing_config( model_signing.hashing.Config().set_ignored_paths( - paths=["README.md"], ignore_git_paths=True + paths=["README.md"], ignore_git_paths=True, ignore_att_paths=True ) ).sign("finbert", "finbert.sig") ``` This example generates a signature using a private key based on elliptic curve -cryptography. It also hashes the model by ignoring `README.md` and any git -related file present in the model directory. +cryptography. It also hashes the model by ignoring `README.md`, any git-related +files, and attestation files present in the model directory. 
We also support signing with signing certificates, using a similar API as above. diff --git a/src/model_signing/_cli.py b/src/model_signing/_cli.py index 2d90f135..c5903db2 100644 --- a/src/model_signing/_cli.py +++ b/src/model_signing/_cli.py @@ -93,6 +93,18 @@ def set_attribute(self, key, value): help="Ignore git-related files when signing or verifying.", ) +# Decorator for the commonly used option to ignore attestation files +_ignore_att_paths_option = click.option( + "--ignore-att-paths/--no-ignore-att-paths", + type=bool, + default=True, + show_default=True, + help=( + "Ignore signature and attestation files " + "(*.sig, *.sigstore.json, claims.jsonl)." + ), +) + # Decorator for the commonly used option to ignore all unsigned files _ignore_unsigned_files_option = click.option( "--ignore_unsigned_files/--no-ignore_unsigned_files", @@ -282,6 +294,7 @@ def _sign() -> None: @_model_path_argument @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_write_signature_option @_sigstore_staging_option @@ -326,6 +339,7 @@ def _sign_sigstore( model_path: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, signature: pathlib.Path, use_ambient_credentials: bool, @@ -388,7 +402,9 @@ def _sign_sigstore( ).set_hashing_config( model_signing.hashing.Config() .set_ignored_paths( - paths=ignored, ignore_git_paths=ignore_git_paths + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, ) .set_allow_symlinks(allow_symlinks) ).sign(model_path, signature) @@ -403,6 +419,7 @@ def _sign_sigstore( @_model_path_argument @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_write_signature_option @_private_key_option @@ -416,6 +433,7 @@ def _sign_private_key( model_path: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: 
bool, signature: pathlib.Path, private_key: pathlib.Path, @@ -442,7 +460,11 @@ def _sign_private_key( private_key=private_key, password=password ).set_hashing_config( model_signing.hashing.Config() - .set_ignored_paths(paths=ignored, ignore_git_paths=ignore_git_paths) + .set_ignored_paths( + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, + ) .set_allow_symlinks(allow_symlinks) ).sign(model_path, signature) except Exception as err: @@ -456,6 +478,7 @@ def _sign_private_key( @_model_path_argument @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_write_signature_option @_pkcs11_uri_option @@ -463,6 +486,7 @@ def _sign_pkcs11_key( model_path: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, signature: pathlib.Path, pkcs11_uri: str, @@ -488,7 +512,11 @@ def _sign_pkcs11_key( pkcs11_uri=pkcs11_uri ).set_hashing_config( model_signing.hashing.Config() - .set_ignored_paths(paths=ignored, ignore_git_paths=ignore_git_paths) + .set_ignored_paths( + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, + ) .set_allow_symlinks(allow_symlinks) ).sign(model_path, signature) except Exception as err: @@ -502,6 +530,7 @@ def _sign_pkcs11_key( @_model_path_argument @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_write_signature_option @_private_key_option @@ -511,6 +540,7 @@ def _sign_certificate( model_path: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, signature: pathlib.Path, private_key: pathlib.Path, @@ -543,7 +573,11 @@ def _sign_certificate( certificate_chain=certificate_chain, ).set_hashing_config( model_signing.hashing.Config() - .set_ignored_paths(paths=ignored, ignore_git_paths=ignore_git_paths) + .set_ignored_paths( + paths=ignored, + 
ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, + ) .set_allow_symlinks(allow_symlinks) ).sign(model_path, signature) except Exception as err: @@ -557,6 +591,7 @@ def _sign_certificate( @_model_path_argument @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_write_signature_option @_pkcs11_uri_option @@ -566,6 +601,7 @@ def _sign_pkcs11_certificate( model_path: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, signature: pathlib.Path, pkcs11_uri: str, @@ -599,7 +635,11 @@ def _sign_pkcs11_certificate( certificate_chain=certificate_chain, ).set_hashing_config( model_signing.hashing.Config() - .set_ignored_paths(paths=ignored, ignore_git_paths=ignore_git_paths) + .set_ignored_paths( + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, + ) .set_allow_symlinks(allow_symlinks) ).sign(model_path, signature) except Exception as err: @@ -637,6 +677,7 @@ def _verify() -> None: @_read_signature_option @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_sigstore_staging_option @_trust_config_option @@ -660,6 +701,7 @@ def _verify_sigstore( signature: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, identity: str, identity_provider: str, @@ -696,7 +738,9 @@ def _verify_sigstore( ).set_hashing_config( model_signing.hashing.Config() .set_ignored_paths( - paths=ignored, ignore_git_paths=ignore_git_paths + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, ) .set_allow_symlinks(allow_symlinks) ).set_ignore_unsigned_files(ignore_unsigned_files).verify( @@ -714,6 +758,7 @@ def _verify_sigstore( @_read_signature_option @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @click.option( 
"--public_key", @@ -728,6 +773,7 @@ def _verify_private_key( signature: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, public_key: pathlib.Path, ignore_unsigned_files: bool, @@ -753,7 +799,11 @@ def _verify_private_key( public_key=public_key ).set_hashing_config( model_signing.hashing.Config() - .set_ignored_paths(paths=ignored, ignore_git_paths=ignore_git_paths) + .set_ignored_paths( + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, + ) .set_allow_symlinks(allow_symlinks) ).set_ignore_unsigned_files(ignore_unsigned_files).verify( model_path, signature @@ -770,6 +820,7 @@ def _verify_private_key( @_read_signature_option @_ignore_paths_option @_ignore_git_paths_option +@_ignore_att_paths_option @_allow_symlinks_option @_certificate_root_of_trust_option @click.option( @@ -786,6 +837,7 @@ def _verify_certificate( signature: pathlib.Path, ignore_paths: Iterable[pathlib.Path], ignore_git_paths: bool, + ignore_att_paths: bool, allow_symlinks: bool, certificate_chain: Iterable[pathlib.Path], log_fingerprints: bool, @@ -816,7 +868,11 @@ def _verify_certificate( log_fingerprints=log_fingerprints, ).set_hashing_config( model_signing.hashing.Config() - .set_ignored_paths(paths=ignored, ignore_git_paths=ignore_git_paths) + .set_ignored_paths( + paths=ignored, + ignore_git_paths=ignore_git_paths, + ignore_att_paths=ignore_att_paths, + ) .set_allow_symlinks(allow_symlinks) ).set_ignore_unsigned_files(ignore_unsigned_files).verify( model_path, signature diff --git a/src/model_signing/hashing.py b/src/model_signing/hashing.py index d97ea9de..5607a1d8 100644 --- a/src/model_signing/hashing.py +++ b/src/model_signing/hashing.py @@ -28,7 +28,7 @@ ```python hashing_config = model_signing.hashing.Config().set_ignored_paths( - paths=["README.md"], ignore_git_paths=True + paths=["README.md"], ignore_git_paths=True, ignore_att_paths=True ) signing_config = ( @@ -47,7 +47,7 
@@ The API defined here is stable and backwards compatible. """ -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, Iterator import os import pathlib import sys @@ -80,6 +80,26 @@ PathLike: TypeAlias = str | bytes | os.PathLike +# Git-related paths that are optionally ignored when hashing models. +_GIT_IGNORE_PATHS: list[str] = [ + ".git/", + ".gitattributes", + ".github/", + ".gitignore", +] + +# Signature and attestation paths that are ignored by default when +# hashing models. These files should be signed independently to allow +# attestations to accumulate throughout a model's lifecycle without +# invalidating the original signature. Can be controlled via the +# ignore_att_paths parameter. +_ATTESTATION_IGNORE_PATHS: list[str] = [ + "*.sig", # Signature files + "*.sigstore.json", # Individual attestation files + "claims.jsonl", # Bundled attestation files +] + + def hash(model_path: PathLike) -> manifest.Manifest: """Hashes a model using the default configuration. @@ -129,14 +149,16 @@ class Config: This configuration class also supports configuring which paths from the model directory should be ignored. These are files that doesn't impact the behavior of the model, or files that won't be distributed with the model. By - default, only files that are associated with a git repository (`.git`, - `.gitattributes`, `.gitignore`, etc.) are ignored. + default, files associated with a git repository (`.git`, `.gitattributes`, + `.gitignore`, etc.) and attestation files (`*.sig`, `*.sigstore.json`, + `claims.jsonl`) are ignored. 
""" def __init__(self): """Initializes the default configuration for hashing.""" self._ignored_paths = frozenset() self._ignore_git_paths = True + self._ignore_att_paths = True self.use_file_serialization() self._allow_symlinks = False @@ -160,18 +182,23 @@ def hash( continue ignored_paths.append(full) + def expand_patterns(patterns: list[str]) -> Iterator[pathlib.Path]: + """Expand glob patterns and yield paths.""" + for pattern in patterns: + if "*" in pattern or "?" in pattern or "[" in pattern: + # Expand glob pattern + yield from model_path.glob(pattern) + else: + # Literal path + yield model_path / pattern + + # Optionally exclude signature and attestation files + if self._ignore_att_paths: + ignored_paths.extend(expand_patterns(_ATTESTATION_IGNORE_PATHS)) + + # Optionally exclude git-related files if self._ignore_git_paths: - ignored_paths.extend( - [ - model_path / p - for p in [ - ".git/", - ".gitattributes", - ".github/", - ".gitignore", - ] - ] - ) + ignored_paths.extend(expand_patterns(_GIT_IGNORE_PATHS)) self._serializer.set_allow_symlinks(self._allow_symlinks) @@ -375,7 +402,11 @@ def use_shard_serialization( return self def set_ignored_paths( - self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True + self, + *, + paths: Iterable[PathLike], + ignore_git_paths: bool = True, + ignore_att_paths: bool = True, ) -> Self: """Configures the paths to be ignored during serialization of a model. @@ -390,6 +421,10 @@ def set_ignored_paths( paths: The paths to ignore. ignore_git_paths: Whether to ignore git related paths (default) or include them in the signature. + ignore_att_paths: Whether to ignore signature and attestation files + (default) or include them in the signature. Recommended to keep + True to allow attestations to accumulate throughout the model's + lifecycle without invalidating the original signature. Returns: The new hashing configuration with a new set of ignored paths. 
@@ -398,6 +433,7 @@ def set_ignored_paths( # the model directory later when hashing. self._ignored_paths = frozenset(pathlib.Path(p) for p in paths) self._ignore_git_paths = ignore_git_paths + self._ignore_att_paths = ignore_att_paths return self def add_ignored_paths( diff --git a/tests/api_test.py b/tests/api_test.py index fda2857e..d258cb9a 100644 --- a/tests/api_test.py +++ b/tests/api_test.py @@ -200,6 +200,7 @@ def test_sign_and_verify(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ).sign(model_path, signature) @@ -211,6 +212,7 @@ def test_sign_and_verify(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ).verify(model_path, signature) @@ -232,6 +234,7 @@ def test_sign_and_verify(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ).sign(model_path, signature) @@ -267,6 +270,7 @@ def test_sign_and_verify(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ).sign(model_path, signature) @@ -279,6 +283,7 @@ def test_sign_and_verify(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ).verify(model_path, signature) @@ -302,6 +307,7 @@ def test_sign_and_verify(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ).sign(model_path, signature) @@ -336,6 +342,7 @@ def test_sign_and_verify_sharded(self, base_path, populate_tmpdir): 
.set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) .use_shard_serialization() ).sign(model_path, signature) @@ -349,6 +356,7 @@ def test_sign_and_verify_sharded(self, base_path, populate_tmpdir): hashing.Config().set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) ) # .verify(model_path, signature) @@ -374,6 +382,7 @@ def test_sign_and_verify_sharded(self, base_path, populate_tmpdir): .set_ignored_paths( paths=list(ignore_paths) + [signature], ignore_git_paths=ignore_git_paths, + ignore_att_paths=False, ) .use_shard_serialization() ).sign(model_path, signature) diff --git a/tests/hashing_config_test.py b/tests/hashing_config_test.py index 3322298e..9ab03be7 100644 --- a/tests/hashing_config_test.py +++ b/tests/hashing_config_test.py @@ -107,3 +107,159 @@ def test_blake3_file_serialization_with_max_workers(tmp_path): # All manifests should be equal assert manifest1 == manifest2 assert manifest1 == manifest3 + + +def test_signature_files_always_excluded(tmp_path): + """Test that signature files are excluded from hashing by default.""" + model = tmp_path / "model" + model.mkdir() + (model / "model.txt").write_text("model content") + (model / "model.sig").write_text("signature") + (model / "backup.sig").write_text("old signature") + + cfg = hashing.Config() + manifest = cfg.hash(model) + identifiers = {rd.identifier for rd in manifest.resource_descriptors()} + + # Model file should be included + assert "model.txt" in identifiers + # Signature files should be excluded + assert "model.sig" not in identifiers + assert "backup.sig" not in identifiers + + +def test_attestation_files_always_excluded(tmp_path): + """Test that attestation files are excluded from hashing by default.""" + model = tmp_path / "model" + model.mkdir() + (model / "model.txt").write_text("model content") + (model / "model.slsa.sigstore.json").write_text("SLSA 
provenance") + (model / "model.spdx.sigstore.json").write_text("SBOM") + (model / "claims.jsonl").write_text("bundled attestations") + + cfg = hashing.Config() + manifest = cfg.hash(model) + identifiers = {rd.identifier for rd in manifest.resource_descriptors()} + + # Model file should be included + assert "model.txt" in identifiers + # Attestation files should be excluded + assert "model.slsa.sigstore.json" not in identifiers + assert "model.spdx.sigstore.json" not in identifiers + assert "claims.jsonl" not in identifiers + + +def test_attestation_exclusion_independent_of_ignore_git_paths(tmp_path): + """Test attestations excluded by default regardless of ignore_git_paths. + + Verifies that ignore_git_paths setting doesn't affect attestation + exclusion. + """ + model = tmp_path / "model" + model.mkdir() + (model / "model.txt").write_text("model content") + (model / "model.sig").write_text("signature") + (model / ".gitignore").write_text("*.pyc") + + # Test with ignore_git_paths=False (but ignore_att_paths defaults to True) + cfg_no_git = hashing.Config().set_ignored_paths( + paths=[], ignore_git_paths=False + ) + manifest_no_git = cfg_no_git.hash(model) + identifiers_no_git = { + rd.identifier for rd in manifest_no_git.resource_descriptors() + } + + # .gitignore should be included when ignore_git_paths=False + assert ".gitignore" in identifiers_no_git + # But signature files should still be excluded + # (ignore_att_paths defaults to True) + assert "model.sig" not in identifiers_no_git + + # Test with ignore_git_paths=True (and ignore_att_paths defaults to True) + cfg_with_git = hashing.Config().set_ignored_paths( + paths=[], ignore_git_paths=True + ) + manifest_with_git = cfg_with_git.hash(model) + identifiers_with_git = { + rd.identifier for rd in manifest_with_git.resource_descriptors() + } + + # .gitignore should be excluded when ignore_git_paths=True + assert ".gitignore" not in identifiers_with_git + # Signature files should still be excluded + assert 
"model.sig" not in identifiers_with_git + + +def test_ignore_att_paths_can_be_disabled(tmp_path): + """Test that attestation exclusion can be disabled. + + Verifies that setting ignore_att_paths=False includes attestation files + in the signature. + """ + model = tmp_path / "model" + model.mkdir() + (model / "model.txt").write_text("model content") + (model / "model.sig").write_text("signature") + (model / "model.slsa.sigstore.json").write_text("SLSA provenance") + + # Test with ignore_att_paths=False (include attestations) + cfg = hashing.Config().set_ignored_paths(paths=[], ignore_att_paths=False) + manifest = cfg.hash(model) + identifiers = {rd.identifier for rd in manifest.resource_descriptors()} + + # Model file should be included + assert "model.txt" in identifiers + # Attestation files should now be included + assert "model.sig" in identifiers + assert "model.slsa.sigstore.json" in identifiers + + +def test_glob_patterns_are_expanded(tmp_path): + """Regression test: glob patterns must be expanded, not treated literally. + + This test validates that glob patterns like "*.sig" are properly expanded + to match actual files, rather than being treated as literal path components. + + Without proper glob expansion, patterns would be interpreted as literal + paths (e.g., looking for a file literally named "*.sig") and would not + match any files, causing attestation files to be incorrectly included in + signatures. 
+ """ + model = tmp_path / "model" + model.mkdir() + + # Create model file + (model / "model.txt").write_text("model content") + + # Create multiple files matching glob patterns + (model / "model.sig").write_text("signature 1") + (model / "backup.sig").write_text("signature 2") + (model / "another.sig").write_text("signature 3") + (model / "model.slsa.sigstore.json").write_text("SLSA provenance") + (model / "scan.sbom.sigstore.json").write_text("SBOM") + (model / "claims.jsonl").write_text("bundled claims") + + # Create a file that should NOT match any patterns + (model / "data.json").write_text("data") + + # Hash with default configuration (ignore_att_paths=True) + cfg = hashing.Config() + manifest = cfg.hash(model) + identifiers = {rd.identifier for rd in manifest.resource_descriptors()} + + # Model and data files should be included + assert "model.txt" in identifiers + assert "data.json" in identifiers + + # ALL files matching *.sig should be excluded + assert "model.sig" not in identifiers + assert "backup.sig" not in identifiers + assert "another.sig" not in identifiers + + # ALL files matching *.sigstore.json should be excluded + assert "model.slsa.sigstore.json" not in identifiers + assert "scan.sbom.sigstore.json" not in identifiers + + # claims.jsonl (literal path) should be excluded + assert "claims.jsonl" not in identifiers