promptfoo · mldangelo · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026 · coderabbitai
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -81,6 +81,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- **cli:** include streamed artifacts as SBOM components when `scan --stream --sbom` is used
+- **cli:** exclude HuggingFace download cache bookkeeping files from remote SBOMs and asset lists
 - **security:** require official or explicitly allowlisted JFrog hosts before treating `/artifactory/` URLs as authenticated JFrog endpoints
 - **security:** detect CVE-2024-5480 PyTorch torch.distributed.rpc arbitrary function execution via PythonUDF (CVSS 10.0)
 - **security:** detect CVE-2024-48063 PyTorch torch.distributed.rpc.RemoteModule deserialization RCE via pickle (CVSS 9.8)

diff --git a/modelaudit/cli.py b/modelaudit/cli.py
@@ -1901,9 +1901,19 @@ def enhanced_progress_callback(message, percentage):
     if sbom:
         from .integrations.sbom_generator import generate_sbom_pydantic
 
-        # Use scanned_paths (actual file paths) instead of expanded_paths (original URLs)
-        # to prevent FileNotFoundError when generating SBOM for downloaded content
-        paths_for_sbom = scanned_paths if scanned_paths else expanded_paths
+        # Streamed scans may delete files before SBOM generation runs, so when
+        # final_scan_and_delete is set, reuse the scanned asset list so the SBOM
+        # reflects the artifacts that were actually scanned. Cache internals left
+        # under a regular remote download are filtered out by the SBOM generator.
+        asset_paths = list(
+            dict.fromkeys(asset.path for asset in audit_result.assets if asset.path and asset.type != "skipped")
+        )
+        if asset_paths and final_scan_and_delete:
+            paths_for_sbom = asset_paths
+        else:
+            # Use scanned_paths (actual file paths) instead of expanded_paths (original URLs)
+            # to prevent FileNotFoundError when generating SBOM for downloaded content
+            paths_for_sbom = scanned_paths if scanned_paths else expanded_paths
         sbom_text = generate_sbom_pydantic(paths_for_sbom, audit_result)
         with open(sbom, "w", encoding="utf-8") as f:
             f.write(sbom_text)

diff --git a/modelaudit/core.py b/modelaudit/core.py
@@ -662,6 +662,13 @@ def scan_model_directory_or_file(
             for root, _, files in os.walk(path, followlinks=False):
                 for file in files:
                     file_path = os.path.join(root, file)
+
+                    # HuggingFace cache bookkeeping files should never surface as
+                    # scan assets or SBOM components for downloaded models.
+                    if _is_huggingface_cache_file(file_path):
+                        logger.debug(f"Skipping HuggingFace cache file: {file_path}")
+                        continue
+
                     resolved_file = Path(file_path).resolve()
 
                     # Check if this is a HuggingFace cache symlink scenario
@@ -1583,6 +1590,15 @@ def scan_model_streaming(
 
                 # Merge results
                 if scan_result:
+                    metadata_dict = dict(scan_result.metadata or {})
+                    metadata_dict.setdefault("file_size", file_path.stat().st_size)
+
+                    existing_hashes = metadata_dict.get("file_hashes")
+                    if isinstance(existing_hashes, dict):
+                        existing_hashes.setdefault("sha256", file_hash)
+                    else:
+                        metadata_dict["file_hashes"] = {"sha256": file_hash}
+
                     # Use dict-based aggregation to avoid import issues
                     scan_result_dict = {
                         "bytes_scanned": scan_result.bytes_scanned,
@@ -1592,6 +1608,7 @@ def scan_model_streaming(
                         "issues": [issue.__dict__ for issue in (scan_result.issues or [])],
                         "checks": [check.__dict__ for check in (scan_result.checks or [])],
                         "scanners": [scan_result.scanner_name] if scan_result.scanner_name else [],
+                        "file_metadata": {str(file_path): metadata_dict},
                     }
                     results.aggregate_scan_result(scan_result_dict)
 

diff --git a/modelaudit/integrations/sbom_generator.py b/modelaudit/integrations/sbom_generator.py
@@ -77,6 +77,49 @@ def _get_component_type(path: str, metadata: dict[str, Any] | None) -> Component
     return ComponentType.FILE
 
 
+def _resolve_component_size_and_sha256(
+    path: str,
+    metadata: FileMetadataModel | dict[str, Any] | None,
+) -> tuple[int, str]:
+    """Resolve a component's size and SHA-256, preferring the file on disk.
+
+    When ``path`` is not a readable regular file, the values recorded in
+    ``metadata`` (``file_size`` and ``file_hashes.sha256``, from either the
+    Pydantic model or a plain dict) are used instead.
+
+    Returns:
+        ``(size, sha256)``. Values that cannot be determined are ``0`` and ``""``.
+    """
+    # isfile() rather than exists(): a directory also "exists" but cannot be
+    # hashed (presumably _file_sha256 reads the path as a regular file).
+    if os.path.isfile(path):
+        try:
+            return os.path.getsize(path), _file_sha256(path)
+        except FileNotFoundError:
+            # The file vanished between the check and the read (streamed scans
+            # may delete files), so fall through to the recorded metadata.
+            pass
+
+    file_size = 0
+    sha256 = ""
+
+    if isinstance(metadata, FileMetadataModel):
+        if metadata.file_size is not None:
+            file_size = metadata.file_size
+        if metadata.file_hashes and metadata.file_hashes.sha256:
+            sha256 = metadata.file_hashes.sha256
+    elif isinstance(metadata, dict):
+        # Dict metadata is untyped, so validate each value before trusting it.
+        raw_size = metadata.get("file_size")
+        if isinstance(raw_size, int):
+            file_size = raw_size
+        raw_hashes = metadata.get("file_hashes")
+        if isinstance(raw_hashes, dict):
+            raw_sha256 = raw_hashes.get("sha256")
+            if isinstance(raw_sha256, str):
+                sha256 = raw_sha256
+
+    return file_size, sha256
+
+
+def _should_skip_sbom_file(path: str) -> bool:
+    """Return True for cache/VCS bookkeeping files that are not model artifacts.
+
+    ``main`` and ``HEAD`` count as bookkeeping only when they sit under a
+    ``refs`` directory, so an artifact that merely shares one of those names
+    is still listed in the SBOM.
+    """
+    # Split on both separators so Windows-style paths work on any platform.
+    *parents, filename = path.replace("\\", "/").split("/")
+    if filename.endswith((".metadata", ".lock")):
+        return True
+    if filename in {".gitignore", ".gitattributes"}:
+        return True
+    return filename in {"main", "HEAD"} and "refs" in parents
+
+
 def _calculate_risk_score(path: str, issues: list[Issue]) -> int:
     """Calculate risk score for a file based on associated issues."""
     score = 0
@@ -179,8 +222,7 @@ def _component_for_file_pydantic(
     issues: list[Issue],
 ) -> Component:
     """Create a CycloneDX component from Pydantic models (type-safe version)."""
-    size = os.path.getsize(path) if os.path.exists(path) else 0
-    sha256 = _file_sha256(path) if os.path.exists(path) else ""
+    size, sha256 = _resolve_component_size_and_sha256(path, metadata)
 
     # Start with basic properties
     props = [Property(name="size", value=str(size))]
@@ -219,8 +261,7 @@ def _component_for_file(
     metadata: dict[str, Any],
     issues: Iterable[dict[str, Any]],
 ) -> Component:
-    size = os.path.getsize(path)
-    sha256 = _file_sha256(path)
+    size, sha256 = _resolve_component_size_and_sha256(path, metadata)
     props = [Property(name="size", value=str(size))]
 
     # Compute risk score based on issues related to this file
@@ -321,7 +362,7 @@ def _component_for_file(
         name=os.path.basename(path),
         bom_ref=path,
         type=component_type,
-        hashes=[HashType.from_hashlib_alg("sha256", sha256)],
+        hashes=[HashType.from_hashlib_alg("sha256", sha256)] if sha256 else [],
         properties=props,
     )
 
@@ -350,6 +391,8 @@ def generate_sbom(paths: Iterable[str], results: dict[str, Any] | Any) -> str:
             for root, _, files in os.walk(input_path):
                 for f in files:
                     fp = os.path.join(root, f)
+                    if _should_skip_sbom_file(fp):
+                        continue
                     meta_model = file_meta.get(fp)
                     # Convert Pydantic model to dict if needed
                     if meta_model is not None and hasattr(meta_model, "model_dump"):
@@ -390,6 +433,8 @@ def generate_sbom_pydantic(paths: Iterable[str], results: ModelAuditResultModel)
             for root, _, files in os.walk(input_path):
                 for f in files:
                     fp = os.path.join(root, f)
+                    if _should_skip_sbom_file(fp):
+                        continue
                     metadata = file_metadata.get(fp)
                     component = _component_for_file_pydantic(fp, metadata, issues)
                     bom.components.add(component)

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,14 +1,14 @@
 import json
 import os
 import re
-from pathlib import Path
-from unittest.mock import Mock, patch
+from unittest.mock import patch
 
 import pytest
 from click.testing import CliRunner
 
 from modelaudit import __version__
 from modelaudit.cli import cli, expand_paths, format_text_output
+from modelaudit.core import scan_model_directory_or_file
 from modelaudit.models import create_initial_audit_result
 
 
@@ -68,6 +68,15 @@ def create_mock_scan_result(**kwargs):
     if "scanners" in kwargs:
         result.scanner_names = kwargs["scanners"]
 
+    # Add file metadata if provided
+    if "file_metadata" in kwargs:
+        from modelaudit.models import FileMetadataModel
+
+        result.file_metadata = {
+            path: metadata if isinstance(metadata, FileMetadataModel) else FileMetadataModel(**metadata)
+            for path, metadata in kwargs["file_metadata"].items()
+        }
+
     result.finalize_statistics()
     return result
 
@@ -779,58 +788,152 @@ def file_generator():
 @patch("modelaudit.cli.is_huggingface_url")
 @patch("modelaudit.utils.sources.huggingface.download_model_streaming")
 @patch("modelaudit.core.scan_model_streaming")
-def test_scan_huggingface_streaming_sbom_contains_all_components(
-    mock_scan_streaming: Mock, mock_download_streaming: Mock, mock_is_hf_url: Mock, tmp_path: Path
-) -> None:
-    """Regression test for issue #671: --stream should still produce full SBOM components."""
+def test_scan_huggingface_streaming_sbom_includes_streamed_assets(
+    mock_scan_streaming, mock_download_streaming, mock_is_hf_url, tmp_path
+):
+    """Streaming scans should build SBOM components from discovered artifacts."""
     mock_is_hf_url.return_value = True
 
-    # The generator itself is not consumed in this test because scan_model_streaming is mocked.
+    streamed_files = []
+    file_hashes = {}
+    file_sizes = {}
+    for name in ("config.json", "model.safetensors", "tokenizer.json"):
+        file_path = tmp_path / name
+        content = f"streamed content for {name}".encode()
+        file_path.write_bytes(content)
+        streamed_files.append(file_path)
+        import hashlib
+
+        file_hashes[str(file_path)] = hashlib.sha256(content).hexdigest()
+        file_sizes[str(file_path)] = len(content)
+
     def file_generator():
-        yield (tmp_path / "model-00001-of-00002.safetensors", False)
-        yield (tmp_path / "model-00002-of-00002.safetensors", True)
+        for index, file_path in enumerate(streamed_files):
+            yield (file_path, index == len(streamed_files) - 1)
 
     mock_download_streaming.return_value = file_generator()
 
-    streamed_assets = [
-        {
-            "path": str(tmp_path / "model-00001-of-00002.safetensors"),
-            "type": "safetensors",
-            "size": 123,
-        },
-        {
-            "path": str(tmp_path / "model-00002-of-00002.safetensors"),
-            "type": "safetensors",
-            "size": 456,
+    mock_result = create_mock_scan_result(
+        bytes_scanned=sum(file_path.stat().st_size for file_path in streamed_files),
+        files_scanned=len(streamed_files),
+        has_errors=False,
+        assets=[
+            {
+                "path": str(file_path),
+                "type": "streamed",
+                "size": file_sizes[str(file_path)],
+            }
+            for file_path in streamed_files
+        ],
+        file_metadata={
+            str(file_path): {
+                "file_size": file_sizes[str(file_path)],
+                "file_hashes": {"sha256": file_hashes[str(file_path)]},
+            }
+            for file_path in streamed_files
         },
-    ]
-    mock_scan_streaming.return_value = create_mock_scan_result(
-        bytes_scanned=579,
-        files_scanned=2,
-        assets=streamed_assets,
     )
+    mock_scan_streaming.return_value = mock_result
+
+    for file_path in streamed_files:
+        file_path.unlink()
+
+    sbom_file = tmp_path / "streamed.sbom.json"
 
-    sbom_file = tmp_path / "streaming_sbom.json"
     runner = CliRunner()
-    result = runner.invoke(
-        cli,
-        [
-            "scan",
-            "--stream",
-            "--sbom",
-            str(sbom_file),
-            "https://huggingface.co/test/model",
+    result = runner.invoke(cli, ["scan", "--stream", "--quiet", "--sbom", str(sbom_file), "hf://test/model"])
+
+    assert result.exit_code == 0
+    assert sbom_file.exists()
+    sbom_data = json.loads(sbom_file.read_text())
+    components = {component["name"]: component for component in sbom_data["components"]}
+
+    assert set(components) == {file_path.name for file_path in streamed_files}
+    assert "model" not in components
+
+    for file_path in streamed_files:
+        component = components[file_path.name]
+        properties = {prop["name"]: prop["value"] for prop in component["properties"]}
+
+        assert properties["size"] == str(file_sizes[str(file_path)])
+        assert component["hashes"][0]["alg"] == "SHA-256"
+        assert component["hashes"][0]["content"] == file_hashes[str(file_path)]
+
+
+@patch("modelaudit.cli.is_huggingface_url")
+@patch("modelaudit.cli.is_huggingface_file_url", return_value=False)
+@patch("modelaudit.cli.download_model")
+@patch("modelaudit.cli.scan_model_directory_or_file")
+def test_scan_huggingface_sbom_excludes_download_cache_files(
+    mock_scan, mock_download, mock_is_hf_file_url, mock_is_hf_url, tmp_path
+):
+    """Remote SBOM generation should ignore HuggingFace cache bookkeeping files."""
+    mock_is_hf_url.return_value = True
+
+    downloaded_dir = tmp_path / "gpt2"
+    downloaded_dir.mkdir()
+    real_files = {
+        downloaded_dir / "config.json": b'{"model_type":"gpt2"}',
+        downloaded_dir / "merges.txt": b"merge rules",
+        downloaded_dir / "model.safetensors": b"weights",
+    }
+    for file_path, content in real_files.items():
+        file_path.write_bytes(content)
+
+    cache_dir = downloaded_dir / ".cache" / "huggingface" / "download"
+    cache_dir.mkdir(parents=True)
+    (cache_dir / "config.json.metadata").write_text("{}")
+    (cache_dir / ".gitignore").write_text("*\n")
+
+    mock_download.return_value = downloaded_dir
+    mock_scan.return_value = create_mock_scan_result(
+        bytes_scanned=sum(len(content) for content in real_files.values()),
+        files_scanned=len(real_files),
+        has_errors=False,
+        assets=[
+            {
+                "path": str(file_path),
+                "type": "streamed",
+                "size": len(content),
+            }
+            for file_path, content in real_files.items()
         ],
     )
 
+    sbom_file = tmp_path / "downloaded.sbom.json"
+
+    runner = CliRunner()
+    result = runner.invoke(cli, ["scan", "--quiet", "--sbom", str(sbom_file), "hf://test/model"])
+
     assert result.exit_code == 0
-    assert sbom_file.exists()
 
-    sbom_json = json.loads(sbom_file.read_text(encoding="utf-8"))
-    component_names = {component["name"] for component in sbom_json.get("components", [])}
+    sbom_data = json.loads(sbom_file.read_text())
+    component_names = {component["name"] for component in sbom_data["components"]}
+
+    assert component_names == {file_path.name for file_path in real_files}
+    assert "config.json.metadata" not in component_names
+    assert ".gitignore" not in component_names
+
+
+def test_scan_directory_skips_huggingface_cache_bookkeeping(tmp_path):
+    """Directory scans should not surface HuggingFace cache bookkeeping files as assets."""
+    model_dir = tmp_path / "model"
+    model_dir.mkdir()
+    (model_dir / "config.json").write_text('{"model_type":"gpt2"}')
+    (model_dir / "model.safetensors").write_bytes(b"weights")
+
+    cache_dir = model_dir / ".cache" / "huggingface" / "download"
+    cache_dir.mkdir(parents=True)
+    (cache_dir / "config.json.metadata").write_text("{}")
+    (cache_dir / ".gitignore").write_text("*\n")
+
+    result = scan_model_directory_or_file(str(model_dir))
 
-    assert "model-00001-of-00002.safetensors" in component_names
-    assert "model-00002-of-00002.safetensors" in component_names
+    asset_names = {os.path.basename(asset.path) for asset in result.assets}
+    assert "config.json" in asset_names
+    assert "model.safetensors" in asset_names
+    assert "config.json.metadata" not in asset_names
+    assert ".gitignore" not in asset_names
 
 
 @patch("modelaudit.cli.is_huggingface_url")