-
Notifications
You must be signed in to change notification settings - Fork 6
fix: preserve streamed SBOM metadata and exclude HF cache files #673
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,6 +77,49 @@ def _get_component_type(path: str, metadata: dict[str, Any] | None) -> Component | |
| return ComponentType.FILE | ||
|
|
||
|
|
||
| def _resolve_component_size_and_sha256( | ||
| path: str, | ||
| metadata: FileMetadataModel | dict[str, Any] | None, | ||
| ) -> tuple[int, str]: | ||
| """Resolve component size/hash from disk, falling back to recorded metadata.""" | ||
| if os.path.exists(path): | ||
| return os.path.getsize(path), _file_sha256(path) | ||
|
|
||
| file_size = 0 | ||
| sha256 = "" | ||
|
|
||
| if isinstance(metadata, FileMetadataModel): | ||
| if metadata.file_size is not None: | ||
| file_size = metadata.file_size | ||
| if metadata.file_hashes and metadata.file_hashes.sha256: | ||
| sha256 = metadata.file_hashes.sha256 | ||
| elif isinstance(metadata, dict): | ||
| raw_size = metadata.get("file_size") | ||
| if isinstance(raw_size, int): | ||
| file_size = raw_size | ||
| raw_hashes = metadata.get("file_hashes") | ||
| if isinstance(raw_hashes, dict): | ||
| raw_sha256 = raw_hashes.get("sha256") | ||
| if isinstance(raw_sha256, str): | ||
| sha256 = raw_sha256 | ||
|
|
||
| return file_size, sha256 | ||
|
|
||
|
|
||
| def _should_skip_sbom_file(path: str) -> bool: | ||
| """Skip cache bookkeeping files that are not model artifacts.""" | ||
| filename = os.path.basename(path) | ||
|
|
||
| if filename.endswith(".metadata") or filename.endswith(".lock"): | ||
| return True | ||
|
|
||
| if filename in {".gitignore", ".gitattributes", "main", "HEAD"}: | ||
| return True | ||
|
|
||
| normalized_path = path.replace("\\", "/") | ||
| return "/refs/" in normalized_path and filename in {"main", "HEAD"} | ||
|
|
||
|
|
||
| def _calculate_risk_score(path: str, issues: list[Issue]) -> int: | ||
| """Calculate risk score for a file based on associated issues.""" | ||
| score = 0 | ||
|
|
@@ -179,8 +222,7 @@ def _component_for_file_pydantic( | |
| issues: list[Issue], | ||
| ) -> Component: | ||
| """Create a CycloneDX component from Pydantic models (type-safe version).""" | ||
| size = os.path.getsize(path) if os.path.exists(path) else 0 | ||
| sha256 = _file_sha256(path) if os.path.exists(path) else "" | ||
| size, sha256 = _resolve_component_size_and_sha256(path, metadata) | ||
|
|
||
| # Start with basic properties | ||
| props = [Property(name="size", value=str(size))] | ||
|
|
@@ -219,8 +261,7 @@ def _component_for_file( | |
| metadata: dict[str, Any], | ||
| issues: Iterable[dict[str, Any]], | ||
| ) -> Component: | ||
| size = os.path.getsize(path) | ||
| sha256 = _file_sha256(path) | ||
| size, sha256 = _resolve_component_size_and_sha256(path, metadata) | ||
| props = [Property(name="size", value=str(size))] | ||
|
|
||
| # Compute risk score based on issues related to this file | ||
|
|
@@ -321,7 +362,7 @@ def _component_for_file( | |
| name=os.path.basename(path), | ||
| bom_ref=path, | ||
| type=component_type, | ||
| hashes=[HashType.from_hashlib_alg("sha256", sha256)], | ||
| hashes=[HashType.from_hashlib_alg("sha256", sha256)] if sha256 else [], | ||
| properties=props, | ||
| ) | ||
|
|
||
|
|
@@ -350,6 +391,8 @@ def generate_sbom(paths: Iterable[str], results: dict[str, Any] | Any) -> str: | |
| for root, _, files in os.walk(input_path): | ||
| for f in files: | ||
| fp = os.path.join(root, f) | ||
| if _should_skip_sbom_file(fp): | ||
| continue | ||
|
Comment on lines
+394
to
+395
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apply the SBOM skip filter to single-file inputs too. This only filters directory walks. In the new streamed flow, Proposed fix else:
+ if _should_skip_sbom_file(input_path):
+ continue
meta_model = file_meta.get(input_path)
# Convert Pydantic model to dict if needed
if meta_model is not None and hasattr(meta_model, "model_dump"):
meta = meta_model.model_dump()
else:
meta = meta_model or {}
component = _component_for_file(input_path, meta, issues_dicts)
bom.components.add(component) else:
+ if _should_skip_sbom_file(input_path):
+ continue
metadata = file_metadata.get(input_path)
component = _component_for_file_pydantic(input_path, metadata, issues)
bom.components.add(component)Also applies to: 436-437 🤖 Prompt for AI Agents |
||
| meta_model = file_meta.get(fp) | ||
| # Convert Pydantic model to dict if needed | ||
| if meta_model is not None and hasattr(meta_model, "model_dump"): | ||
|
|
@@ -390,6 +433,8 @@ def generate_sbom_pydantic(paths: Iterable[str], results: ModelAuditResultModel) | |
| for root, _, files in os.walk(input_path): | ||
| for f in files: | ||
| fp = os.path.join(root, f) | ||
| if _should_skip_sbom_file(fp): | ||
| continue | ||
| metadata = file_metadata.get(fp) | ||
| component = _component_for_file_pydantic(fp, metadata, issues) | ||
| bom.components.add(component) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,14 @@ | ||
| import json | ||
| import os | ||
| import re | ||
| from pathlib import Path | ||
| from unittest.mock import Mock, patch | ||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
| from click.testing import CliRunner | ||
|
|
||
| from modelaudit import __version__ | ||
| from modelaudit.cli import cli, expand_paths, format_text_output | ||
| from modelaudit.core import scan_model_directory_or_file | ||
| from modelaudit.models import create_initial_audit_result | ||
|
|
||
|
|
||
|
|
@@ -68,6 +68,15 @@ def create_mock_scan_result(**kwargs): | |
| if "scanners" in kwargs: | ||
| result.scanner_names = kwargs["scanners"] | ||
|
|
||
| # Add file metadata if provided | ||
| if "file_metadata" in kwargs: | ||
| from modelaudit.models import FileMetadataModel | ||
|
|
||
| result.file_metadata = { | ||
| path: metadata if isinstance(metadata, FileMetadataModel) else FileMetadataModel(**metadata) | ||
| for path, metadata in kwargs["file_metadata"].items() | ||
| } | ||
|
|
||
| result.finalize_statistics() | ||
| return result | ||
|
|
||
|
|
@@ -779,58 +788,152 @@ def file_generator(): | |
| @patch("modelaudit.cli.is_huggingface_url") | ||
| @patch("modelaudit.utils.sources.huggingface.download_model_streaming") | ||
| @patch("modelaudit.core.scan_model_streaming") | ||
| def test_scan_huggingface_streaming_sbom_contains_all_components( | ||
| mock_scan_streaming: Mock, mock_download_streaming: Mock, mock_is_hf_url: Mock, tmp_path: Path | ||
| ) -> None: | ||
| """Regression test for issue #671: --stream should still produce full SBOM components.""" | ||
| def test_scan_huggingface_streaming_sbom_includes_streamed_assets( | ||
| mock_scan_streaming, mock_download_streaming, mock_is_hf_url, tmp_path | ||
| ): | ||
|
Comment on lines
+791
to
+793
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major Add the required test annotations. These new tests are missing the repo-required As per coding guidelines, "Use type hints Also applies to: 868-870, 919-919 🤖 Prompt for AI Agents |
||
| """Streaming scans should build SBOM components from discovered artifacts.""" | ||
| mock_is_hf_url.return_value = True | ||
|
|
||
| # The generator itself is not consumed in this test because scan_model_streaming is mocked. | ||
| streamed_files = [] | ||
| file_hashes = {} | ||
| file_sizes = {} | ||
| for name in ("config.json", "model.safetensors", "tokenizer.json"): | ||
| file_path = tmp_path / name | ||
| content = f"streamed content for {name}".encode() | ||
| file_path.write_bytes(content) | ||
| streamed_files.append(file_path) | ||
| import hashlib | ||
|
|
||
| file_hashes[str(file_path)] = hashlib.sha256(content).hexdigest() | ||
| file_sizes[str(file_path)] = len(content) | ||
|
|
||
| def file_generator(): | ||
| yield (tmp_path / "model-00001-of-00002.safetensors", False) | ||
| yield (tmp_path / "model-00002-of-00002.safetensors", True) | ||
| for index, file_path in enumerate(streamed_files): | ||
| yield (file_path, index == len(streamed_files) - 1) | ||
|
|
||
| mock_download_streaming.return_value = file_generator() | ||
|
|
||
| streamed_assets = [ | ||
| { | ||
| "path": str(tmp_path / "model-00001-of-00002.safetensors"), | ||
| "type": "safetensors", | ||
| "size": 123, | ||
| }, | ||
| { | ||
| "path": str(tmp_path / "model-00002-of-00002.safetensors"), | ||
| "type": "safetensors", | ||
| "size": 456, | ||
| mock_result = create_mock_scan_result( | ||
| bytes_scanned=sum(file_path.stat().st_size for file_path in streamed_files), | ||
| files_scanned=len(streamed_files), | ||
| has_errors=False, | ||
| assets=[ | ||
| { | ||
| "path": str(file_path), | ||
| "type": "streamed", | ||
| "size": file_sizes[str(file_path)], | ||
| } | ||
| for file_path in streamed_files | ||
| ], | ||
| file_metadata={ | ||
| str(file_path): { | ||
| "file_size": file_sizes[str(file_path)], | ||
| "file_hashes": {"sha256": file_hashes[str(file_path)]}, | ||
| } | ||
| for file_path in streamed_files | ||
| }, | ||
| ] | ||
| mock_scan_streaming.return_value = create_mock_scan_result( | ||
| bytes_scanned=579, | ||
| files_scanned=2, | ||
| assets=streamed_assets, | ||
| ) | ||
| mock_scan_streaming.return_value = mock_result | ||
|
|
||
| for file_path in streamed_files: | ||
| file_path.unlink() | ||
|
|
||
| sbom_file = tmp_path / "streamed.sbom.json" | ||
|
|
||
| sbom_file = tmp_path / "streaming_sbom.json" | ||
| runner = CliRunner() | ||
| result = runner.invoke( | ||
| cli, | ||
| [ | ||
| "scan", | ||
| "--stream", | ||
| "--sbom", | ||
| str(sbom_file), | ||
| "https://huggingface.co/test/model", | ||
| result = runner.invoke(cli, ["scan", "--stream", "--quiet", "--sbom", str(sbom_file), "hf://test/model"]) | ||
|
|
||
| assert result.exit_code == 0 | ||
| assert sbom_file.exists() | ||
| sbom_data = json.loads(sbom_file.read_text()) | ||
| components = {component["name"]: component for component in sbom_data["components"]} | ||
|
|
||
| assert set(components) == {file_path.name for file_path in streamed_files} | ||
| assert "model" not in components | ||
|
|
||
| for file_path in streamed_files: | ||
| component = components[file_path.name] | ||
| properties = {prop["name"]: prop["value"] for prop in component["properties"]} | ||
|
|
||
| assert properties["size"] == str(file_sizes[str(file_path)]) | ||
| assert component["hashes"][0]["alg"] == "SHA-256" | ||
| assert component["hashes"][0]["content"] == file_hashes[str(file_path)] | ||
|
Comment on lines
+854
to
+860
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't couple the test to hash ordering.
Suggested assertion change- assert component["hashes"][0]["alg"] == "SHA-256"
- assert component["hashes"][0]["content"] == file_hashes[str(file_path)]
+ sha256_hash = next(hash_entry for hash_entry in component["hashes"] if hash_entry["alg"] == "SHA-256")
+ assert sha256_hash["content"] == file_hashes[str(file_path)]🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
| @patch("modelaudit.cli.is_huggingface_url") | ||
| @patch("modelaudit.cli.is_huggingface_file_url", return_value=False) | ||
| @patch("modelaudit.cli.download_model") | ||
| @patch("modelaudit.cli.scan_model_directory_or_file") | ||
| def test_scan_huggingface_sbom_excludes_download_cache_files( | ||
| mock_scan, mock_download, mock_is_hf_file_url, mock_is_hf_url, tmp_path | ||
| ): | ||
| """Remote SBOM generation should ignore HuggingFace cache bookkeeping files.""" | ||
| mock_is_hf_url.return_value = True | ||
|
|
||
| downloaded_dir = tmp_path / "gpt2" | ||
| downloaded_dir.mkdir() | ||
| real_files = { | ||
| downloaded_dir / "config.json": b'{"model_type":"gpt2"}', | ||
| downloaded_dir / "merges.txt": b"merge rules", | ||
| downloaded_dir / "model.safetensors": b"weights", | ||
| } | ||
| for file_path, content in real_files.items(): | ||
| file_path.write_bytes(content) | ||
|
|
||
| cache_dir = downloaded_dir / ".cache" / "huggingface" / "download" | ||
| cache_dir.mkdir(parents=True) | ||
| (cache_dir / "config.json.metadata").write_text("{}") | ||
| (cache_dir / ".gitignore").write_text("*\n") | ||
|
|
||
| mock_download.return_value = downloaded_dir | ||
| mock_scan.return_value = create_mock_scan_result( | ||
| bytes_scanned=sum(len(content) for content in real_files.values()), | ||
| files_scanned=len(real_files), | ||
| has_errors=False, | ||
| assets=[ | ||
| { | ||
| "path": str(file_path), | ||
| "type": "streamed", | ||
| "size": len(content), | ||
| } | ||
| for file_path, content in real_files.items() | ||
| ], | ||
| ) | ||
|
|
||
| sbom_file = tmp_path / "downloaded.sbom.json" | ||
|
|
||
| runner = CliRunner() | ||
| result = runner.invoke(cli, ["scan", "--quiet", "--sbom", str(sbom_file), "hf://test/model"]) | ||
|
|
||
| assert result.exit_code == 0 | ||
| assert sbom_file.exists() | ||
|
|
||
| sbom_json = json.loads(sbom_file.read_text(encoding="utf-8")) | ||
| component_names = {component["name"] for component in sbom_json.get("components", [])} | ||
| sbom_data = json.loads(sbom_file.read_text()) | ||
| component_names = {component["name"] for component in sbom_data["components"]} | ||
|
|
||
| assert component_names == {file_path.name for file_path in real_files} | ||
| assert "config.json.metadata" not in component_names | ||
| assert ".gitignore" not in component_names | ||
|
|
||
|
|
||
| def test_scan_directory_skips_huggingface_cache_bookkeeping(tmp_path): | ||
| """Directory scans should not surface HuggingFace cache bookkeeping files as assets.""" | ||
| model_dir = tmp_path / "model" | ||
| model_dir.mkdir() | ||
| (model_dir / "config.json").write_text('{"model_type":"gpt2"}') | ||
| (model_dir / "model.safetensors").write_bytes(b"weights") | ||
|
|
||
| cache_dir = model_dir / ".cache" / "huggingface" / "download" | ||
| cache_dir.mkdir(parents=True) | ||
| (cache_dir / "config.json.metadata").write_text("{}") | ||
| (cache_dir / ".gitignore").write_text("*\n") | ||
|
|
||
| result = scan_model_directory_or_file(str(model_dir)) | ||
|
|
||
| assert "model-00001-of-00002.safetensors" in component_names | ||
| assert "model-00002-of-00002.safetensors" in component_names | ||
| asset_names = {os.path.basename(asset.path) for asset in result.assets} | ||
| assert "config.json" in asset_names | ||
| assert "model.safetensors" in asset_names | ||
| assert "config.json.metadata" not in asset_names | ||
| assert ".gitignore" not in asset_names | ||
|
|
||
|
|
||
| @patch("modelaudit.cli.is_huggingface_url") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't apply the Hugging Face cache skip globally.
_is_huggingface_cache_file()matches names likemainandHEADregardless of location. Because this new branch runs for every directory scan, a normal local artifact with one of those names now disappears from scanning and SBOM generation.Proposed fix
🤖 Prompt for AI Agents