NVIDIA-NeMo
diff --git a/‎docs/user-guide/running.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/user-guide/running.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/user-guide/troubleshooting.md‎
Lines changed: 4 additions & 1 deletion b/‎docs/user-guide/troubleshooting.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/nemo_safe_synthesizer/llm/metadata.py‎
Lines changed: 18 additions & 9 deletions b/‎src/nemo_safe_synthesizer/llm/metadata.py‎
Lines changed: 18 additions & 9 deletions
diff --git a/‎src/nemo_safe_synthesizer/llm/utils.py‎
Lines changed: 183 additions & 6 deletions b/‎src/nemo_safe_synthesizer/llm/utils.py‎
Lines changed: 183 additions & 6 deletions
diff --git a/‎src/nemo_safe_synthesizer/preflight/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎src/nemo_safe_synthesizer/preflight/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/nemo_safe_synthesizer/preflight/checks/__init__.py‎
Lines changed: 3 additions & 3 deletions b/‎src/nemo_safe_synthesizer/preflight/checks/__init__.py‎
Lines changed: 3 additions & 3 deletions
@@ -275,7 +275,7 @@ execute in order (`config` → `dataframe` → `metadata` → `advisory`).
 |-------|-------|-------------------|
 | `gpu.cuda` | config | PyTorch is importable and a CUDA GPU is visible |
 | `env.inference_key` | config | `NSS_INFERENCE_KEY` is set when PII classification is enabled (warning only) |
-| `env.hf_token` | config | `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN` is set; warns unconditionally when neither is present so gated-repo downloads don't fail later (warning only) |
+| `env.hf_model_availability` | config | The pretrained model reference is usable locally or can be fetched from Hugging Face; warns about a missing HF token only when online HF access may be needed |
 | `dataset.size` | dataframe | Training split meets the hard minimum row count |
 | `columns.groupby` | dataframe | `group_training_examples_by` column is present and has no nulls |
 | `columns.orderby` | dataframe | `order_training_examples_by` column is present |
 
@@ -467,7 +467,10 @@ check of its own.
 | `no_gpu` | error | `gpu.cuda` | No CUDA GPU detected (required for training or generation) |
 | `low_vram` | warning | `gpu.vram` | Free GPU VRAM may be insufficient |
 | `inference_key_missing` | warning | `env.inference_key` | `NSS_INFERENCE_KEY` not set; PII classification degraded |
-| `hf_token_missing` | warning | `env.hf_token` | Neither `HF_TOKEN` nor `HUGGING_FACE_HUB_TOKEN` set; gated model downloads may fail |
+| `hf_token_missing` | warning | `env.hf_model_availability` | Neither `HF_TOKEN` nor `HUGGING_FACE_HUB_TOKEN` set, and model loading may need online Hugging Face access |
+| `hf_model_not_cached` | warning/error | `env.hf_model_availability` | Hugging Face model is not present in the local cache; severity is error when HF offline mode is enabled, warning otherwise |
+| `hf_model_cache_incomplete` | error | `env.hf_model_availability` | Cached Hugging Face model snapshot is missing required config, tokenizer, weights, or shards |
+| `hf_remote_code_not_cached` | warning/error | `env.hf_model_availability` | A trusted model's config references remote code that is not cached locally; severity is error when HF offline mode is enabled, warning otherwise |
 | `preflight.check_crash` | error | (crashing check) | A check raised an unexpected exception; the issue's `check` field names the crashing check and other checks continued running |
 | `column_not_found` | error | `columns.groupby` / `columns.orderby` | Required column missing from dataset, or input DataFrame uses unsupported MultiIndex columns |
 | `column_nulls` | error | `columns.groupby` | Required column contains null values |
 
@@ -28,7 +28,7 @@
 from ..errors import ParameterError
 from ..observability import get_logger
 from ..utils import load_json, write_json
-from .utils import trust_remote_code_for_model
+from .utils import ModelRef
 
 logger = get_logger(__name__)
 
@@ -96,9 +96,12 @@ def from_tokenizer(cls, name: str, tokenizer: PreTrainedTokenizerBase | None = N
         Returns:
             A new ``LLMPromptConfig`` populated from the tokenizer.
         """
-        tokenizer = tokenizer or AutoTokenizer.from_pretrained(
-            name, trust_remote_code=trust_remote_code_for_model(name)
-        )
+        if tokenizer is None:
+            model_ref = ModelRef.parse(name)
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_ref.target(),
+                trust_remote_code=model_ref.trust_remote_code,
+            )
         bos_token = kwargs.get("bos_token", getattr(tokenizer, "bos_token", None))
         bos_token_id = kwargs.get("bos_token_id", getattr(tokenizer, "bos_token_id", None))
         eos_token = kwargs.get("eos_token", getattr(tokenizer, "eos_token", None))
@@ -362,10 +365,11 @@ def populate_derived_fields(cls, data: dict) -> dict:
         """
         if data.get("autoconfig") is None:
             model_name_or_path = data["model_name_or_path"]
+            model_ref = ModelRef.parse(model_name_or_path)
             try:
                 data["autoconfig"] = AutoConfig.from_pretrained(
-                    model_name_or_path,
-                    trust_remote_code=trust_remote_code_for_model(model_name_or_path),
+                    model_ref.target(),
+                    trust_remote_code=model_ref.trust_remote_code,
                 )
             except OSError as err:
                 raise _model_load_parameter_error(model_name_or_path, err) from err
@@ -496,11 +500,16 @@ def _load_config_and_tokenizer(
         Returns:
             A ``(config, tokenizer)`` tuple ready to pass to ``super().__init__``.
         """
-        trust = trust_remote_code_for_model(model_name_or_path)
+        model_ref = ModelRef.parse(model_name_or_path)
         try:
-            config: PretrainedConfig = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust)
+            config: PretrainedConfig = AutoConfig.from_pretrained(
+                model_ref.target(), trust_remote_code=model_ref.trust_remote_code
+            )
             if tokenizer is None:
-                tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=trust)
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_ref.target(),
+                    trust_remote_code=model_ref.trust_remote_code,
+                )
         except OSError as err:
             raise _model_load_parameter_error(model_name_or_path, err) from err
         return config, tokenizer
 
@@ -11,6 +11,7 @@
 from __future__ import annotations
 
 import gc
+import json
 from dataclasses import dataclass
 from fnmatch import fnmatchcase
 from pathlib import Path
@@ -27,7 +28,35 @@
 
 @dataclass(frozen=True, slots=True)
 class ModelRef:
-    """Resolved model reference for local cache and trust policy decisions."""
+    """Resolved model reference for local cache and trust policy decisions.
+
+    Intended public API:
+    - ``parse()`` normalizes a user-supplied model string or path without
+      contacting Hugging Face.
+    - ``target()`` returns the value that should be passed to
+      ``from_pretrained``-style loaders: a local snapshot path when available,
+      otherwise the original model reference.
+    - ``trust_remote_code`` reports whether the reference belongs to a trusted
+      organization after accounting for resolved local HF cache paths.
+    - ``partial_cached_snapshot()`` returns HF's local snapshot path for the
+      repo/revision, even when the snapshot is incomplete.
+    - ``missing_required_components()`` reports whether a local model directory
+      has the components this project expects before an offline load.
+    - ``missing_remote_code_components()`` reports trusted remote-code files
+      referenced by Transformers ``auto_map`` metadata but absent locally.
+
+    Deliberate Hugging Face coupling:
+    repo-id validation, cache-root resolution, cache scanning, snapshot layout,
+    artifact names, tokenizer filenames, and sharded weight index parsing mirror
+    current Hugging Face Hub and Transformers behavior. This is intentional so
+    NSS decisions match the libraries that load the model. If model loading or
+    cache preflight behavior changes after an upstream HF release, inspect this
+    class first.
+
+    Internal helpers are not a generic model-layout abstraction. They should
+    stay close to HF's implementation rather than grow compatibility shims for
+    unrelated storage formats.
+    """
 
     original: str | Path
     repo_id: str | None = None
@@ -36,6 +65,17 @@ class ModelRef:
     cache_root: Path | None = None
 
     trusted_orgs: ClassVar[frozenset[str]] = frozenset({"nvidia"})
+    tokenizer_artifact_names: ClassVar[frozenset[str]] = frozenset(
+        {
+            "tokenizer.json",
+            "tokenizer.model",
+            "sentencepiece.bpe.model",
+            "spiece.model",
+            "vocab.json",
+            "vocab.txt",
+            "merges.txt",
+        }
+    )
 
     @classmethod
     def parse(
@@ -45,8 +85,18 @@ def parse(
         revision: str = "main",
         cache_root: str | Path | None = None,
     ) -> Self:
-        """Parse a model identifier or path without contacting Hugging Face."""
+        """Parse a model identifier or path without contacting Hugging Face.
+
+        This is safe to call in preflight and loader setup because it uses
+        Hugging Face's local cache APIs only. Cached-model hits may still cost a
+        few milliseconds because HF cache scanning walks cache metadata to
+        confirm model artifacts exist.
+        """
         cache_root_path = Path(cache_root) if cache_root is not None else cls._default_hf_cache_root()
+        model_ref = str(model_name)
+        if not model_ref:
+            return cls(original=model_name, revision=revision, cache_root=cache_root_path)
+
         model_path = Path(model_name)
         if model_path.exists():
             repo_id = cls._repo_id_from_hf_cache_path(model_path, cache_root_path)
@@ -58,7 +108,6 @@ def parse(
                 cache_root=cache_root_path,
             )
 
-        model_ref = str(model_name)
         repo_id = cls._repo_id_from_hub_identifier(model_ref)
         local_path = cls._cached_snapshot_for_repo(repo_id, revision, cache_root_path) if repo_id else None
         return cls(
@@ -95,6 +144,12 @@ def _repo_id_from_hub_identifier(model_ref: str) -> str | None:
 
     @staticmethod
     def _repo_id_from_hf_cache_path(path: Path, cache_root: Path) -> str | None:
+        """Return the HF repo id for a path inside the configured Hub cache.
+
+        This relies on ``huggingface_hub.scan_cache_dir`` and the current
+        ``models--org--repo/snapshots/<commit>`` cache model. It is deliberately
+        not a generic path parser.
+        """
         path_resolved = path.resolve(strict=False)
         from huggingface_hub import scan_cache_dir
         from huggingface_hub.errors import CacheNotFound
@@ -114,7 +169,13 @@ def _repo_id_from_hf_cache_path(path: Path, cache_root: Path) -> str | None:
         return None
 
     @staticmethod
-    def _cached_snapshot_for_repo(repo_id: str, revision: str, cache_root: Path) -> Path | None:
+    def _local_snapshot_for_repo(repo_id: str, revision: str, cache_root: Path) -> Path | None:
+        """Return HF's local snapshot path without validating completeness.
+
+        Delegates to ``snapshot_download(local_files_only=True)`` so behavior
+        stays aligned with Hugging Face cache resolution instead of duplicating
+        ref-file lookup rules.
+        """
         from huggingface_hub import snapshot_download
         from huggingface_hub.errors import LocalEntryNotFoundError
 
@@ -129,7 +190,14 @@ def _cached_snapshot_for_repo(repo_id: str, revision: str, cache_root: Path) ->
             )
         except LocalEntryNotFoundError:
             return None
-        if not ModelRef._snapshot_has_model_artifacts(snapshot_path, cache_root):
+        return snapshot_path
+
+    @classmethod
+    def _cached_snapshot_for_repo(cls, repo_id: str, revision: str, cache_root: Path) -> Path | None:
+        """Return the local snapshot path only when it holds model artifacts.
+
+        Looks the snapshot up through ``_local_snapshot_for_repo`` and returns
+        ``None`` when no snapshot is found or when
+        ``_snapshot_has_model_artifacts`` rejects the one that was found.
+        """
+        snapshot_path = cls._local_snapshot_for_repo(repo_id, revision, cache_root)
+        if snapshot_path is None:
+            return None
+        if not cls._snapshot_has_model_artifacts(snapshot_path, cache_root):
             return None
         return snapshot_path
 
@@ -162,7 +230,11 @@ def _snapshot_has_model_artifacts(cls, snapshot_path: Path, cache_root: Path) ->
 
     @staticmethod
     def _model_artifact_patterns() -> tuple[str, ...]:
-        """Return known model artifact names using HF Hub's public constants."""
+        """Return known model artifact names using HF Hub's public constants.
+
+        Keep this close to Hugging Face's weight naming conventions. New HF
+        artifact names or index formats should be reflected here.
+        """
         from huggingface_hub.constants import (
             FLAX_WEIGHTS_NAME,
             PYTORCH_WEIGHTS_FILE_PATTERN,
@@ -187,6 +259,111 @@ def _model_artifact_patterns() -> tuple[str, ...]:
             "consolidated*.pth",
         )
 
+    @classmethod
+    def _required_component_status(cls, model_dir: Path) -> dict[str, bool]:
+        """Report which required model components exist under ``model_dir``.
+
+        Three components are checked: a ``config.json`` directly inside
+        ``model_dir``, any file at any depth whose name is listed in
+        ``tokenizer_artifact_names``, and model weights as judged by
+        ``_has_complete_model_artifacts``.
+
+        Args:
+            model_dir: Local directory expected to hold a model.
+
+        Returns:
+            Mapping of component label to whether it was found, in the order
+            ``config``, ``tokenizer``, ``model weights``.
+        """
+        present_files = [entry for entry in model_dir.rglob("*") if entry.is_file()]
+        has_config = model_dir.joinpath("config.json").is_file()
+        has_tokenizer = any(entry.name in cls.tokenizer_artifact_names for entry in present_files)
+        has_weights = cls._has_complete_model_artifacts(model_dir, present_files)
+        return {
+            "config": has_config,
+            "tokenizer": has_tokenizer,
+            "model weights": has_weights,
+        }
+
+    @classmethod
+    def missing_required_components(cls, model_dir: Path) -> list[str]:
+        """List the required component labels that ``model_dir`` lacks.
+
+        Labels appear in the order ``_required_component_status`` reports them.
+        """
+        status = cls._required_component_status(model_dir)
+        return [label for label in status if not status[label]]
+
+    @classmethod
+    def missing_remote_code_components(cls, model_dir: Path) -> list[str]:
+        """Return remote-code components referenced by config but absent locally.
+
+        Args:
+            model_dir: Local model directory whose ``config.json`` is inspected.
+
+        Returns:
+            Sorted, de-duplicated labels of referenced code components that are
+            not files under ``model_dir``. Components with no local path
+            (references into another repository) are always reported.
+        """
+        # A set collapses repeats: several ``auto_map`` entries commonly point
+        # at the same module file, which should be reported only once.
+        missing = {
+            component
+            for component, local_path in cls._remote_code_components(model_dir)
+            if local_path is None or not (model_dir / local_path).is_file()
+        }
+        return sorted(missing)
+
+    @classmethod
+    def _remote_code_components(cls, model_dir: Path) -> list[tuple[str, Path | None]]:
+        """Collect code-file requirements declared by ``config.json``'s ``auto_map``.
+
+        Args:
+            model_dir: Local model directory containing ``config.json``.
+
+        Returns:
+            One ``(label, local_path)`` pair per usable class reference, in
+            ``auto_map`` order; ``local_path`` is ``None`` for references into
+            another repository. An empty list is returned when the config is
+            missing, unreadable, not valid UTF-8 JSON, not a JSON object, or has
+            no ``auto_map`` object.
+        """
+        config_path = model_dir / "config.json"
+        try:
+            data = json.loads(config_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # ValueError covers both malformed JSON (JSONDecodeError) and
+            # undecodable bytes (UnicodeDecodeError).
+            return []
+
+        # A valid JSON document whose root is not an object has no ``auto_map``.
+        auto_map = data.get("auto_map") if isinstance(data, dict) else None
+        if not isinstance(auto_map, dict):
+            return []
+
+        components: list[tuple[str, Path | None]] = []
+        for value in auto_map.values():
+            for class_ref in cls._auto_map_class_refs(value):
+                component = cls._remote_code_component(class_ref)
+                if component is not None:
+                    components.append(component)
+        return components
+
+    @staticmethod
+    def _auto_map_class_refs(value: object) -> list[str]:
+        """Normalize one ``auto_map`` value to a list of class-reference strings.
+
+        A string becomes a one-element list, a list keeps only its string
+        items, and any other value produces an empty list.
+        """
+        if isinstance(value, list):
+            return [entry for entry in value if isinstance(entry, str)]
+        return [value] if isinstance(value, str) else []
+
+    @staticmethod
+    def _remote_code_component(class_ref: str) -> tuple[str, Path | None] | None:
+        """Translate one ``auto_map`` class reference into a code-file requirement.
+
+        Args:
+            class_ref: Reference of the form ``module.Class`` or
+                ``repo_id--module.Class``.
+
+        Returns:
+            ``None`` when the reference names no module (no dot, or an empty
+            module part). For a reference without a ``repo_id--`` prefix,
+            ``(label, path)`` where both describe the module's ``.py`` file as
+            a relative path. For a prefixed reference, ``(label, None)``: the
+            label names the other repository and no local path is given.
+        """
+        repo_id: str | None = None
+        module_ref = class_ref
+        if "--" in class_ref:
+            repo_id, module_ref = class_ref.split("--", 1)
+        if "." not in module_ref:
+            return None
+
+        module_name, _ = module_ref.rsplit(".", 1)
+        # ``Path`` already ignores empty segments, so filtering them changes
+        # nothing except that an all-empty module name (e.g. ".Class") is
+        # rejected here instead of making ``with_suffix`` raise ValueError.
+        segments = [segment for segment in module_name.split(".") if segment]
+        if not segments:
+            return None
+        module_path = Path(*segments).with_suffix(".py")
+        if repo_id is not None:
+            return f"remote code from {repo_id} ({module_path.as_posix()})", None
+        return module_path.as_posix(), module_path
+
+    @classmethod
+    def _has_complete_model_artifacts(cls, model_dir: Path, files: list[Path]) -> bool:
+        """Return whether ``files`` contain usable model weights.
+
+        Args:
+            model_dir: Directory that shard names from a weight index are
+                resolved against.
+            files: Candidate files to inspect.
+
+        Returns:
+            When any ``*.index.json`` file is present, ``True`` only if at
+            least one index references shards that all exist. Otherwise,
+            ``True`` if any file name matches a known model artifact pattern.
+        """
+        weight_indexes = [path for path in files if path.name.endswith(".index.json")]
+        if weight_indexes:
+            return any(cls._index_references_existing_shards(model_dir, index_path) for index_path in weight_indexes)
+
+        if not files:
+            return False
+        # The pattern tuple does not depend on the file, so build it once
+        # instead of once per candidate.
+        patterns = cls._model_artifact_patterns()
+        return any(fnmatchcase(path.name, pattern) for path in files for pattern in patterns)
+
+    @staticmethod
+    def _index_references_existing_shards(model_dir: Path, index_path: Path) -> bool:
+        """Return whether an HF weight index references shards present on disk.
+
+        Args:
+            model_dir: Directory the shard file names are resolved against.
+            index_path: Path of the ``*.index.json`` file to inspect.
+
+        Returns:
+            ``True`` only when the index holds a non-empty ``weight_map`` with
+            at least one string shard name and every such shard is a file under
+            ``model_dir``. Unreadable, undecodable, malformed, or non-object
+            index files yield ``False``.
+        """
+        try:
+            data = json.loads(index_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # ValueError covers both malformed JSON (JSONDecodeError) and
+            # undecodable bytes (UnicodeDecodeError).
+            return False
+
+        # A valid JSON document whose root is not an object has no
+        # ``weight_map`` to consult.
+        weight_map = data.get("weight_map") if isinstance(data, dict) else None
+        if not isinstance(weight_map, dict) or not weight_map:
+            return False
+
+        shard_names = {name for name in weight_map.values() if isinstance(name, str)}
+        if not shard_names:
+            return False
+        # NOTE(review): shards are resolved against ``model_dir`` even if the
+        # index sits in a subdirectory; confirm that is the intended layout.
+        return all((model_dir / name).is_file() for name in shard_names)
+
+    def partial_cached_snapshot(self) -> Path | None:
+        """Look up the local HF snapshot for this reference without a completeness check.
+
+        Returns:
+            Whatever ``_local_snapshot_for_repo`` yields for this repo and
+            revision, or ``None`` when the reference has no repo id or no
+            cache root.
+        """
+        repo, root = self.repo_id, self.cache_root
+        if repo is not None and root is not None:
+            return self._local_snapshot_for_repo(repo, self.revision, root)
+        return None
+
     @classmethod
     def is_trusted_org(cls, org: str) -> bool:
         """Return whether an organization is allowed to load remote code."""
 
@@ -18,7 +18,7 @@
     CUDAAvailabilityCheck,
     DatasetSizeCheck,
     GroupbyColumnCheck,
-    HFTokenCheck,
+    HFModelAvailabilityCheck,
     InferenceKeyCheck,
     OrderbyColumnCheck,
     OversamplingCheck,
@@ -60,7 +60,7 @@
     "SmallDatasetCheck",
     "DatasetSizeCheck",
     "GroupbyColumnCheck",
-    "HFTokenCheck",
+    "HFModelAvailabilityCheck",
     "InferenceKeyCheck",
     "IssueCollector",
     "MetadataCheck",
 
@@ -26,7 +26,7 @@
 )
 from .environment import (
     CUDAAvailabilityCheck,
-    HFTokenCheck,
+    HFModelAvailabilityCheck,
     InferenceKeyCheck,
     VRAMHeadroomCheck,
 )
@@ -38,7 +38,7 @@
     "SmallDatasetCheck",
     "DatasetSizeCheck",
     "GroupbyColumnCheck",
-    "HFTokenCheck",
+    "HFModelAvailabilityCheck",
     "InferenceKeyCheck",
     "OrderbyColumnCheck",
     "OversamplingCheck",
@@ -58,7 +58,7 @@
     # CONFIG
     CUDAAvailabilityCheck(),
     InferenceKeyCheck(),
-    HFTokenCheck(),
+    HFModelAvailabilityCheck(),
     # DATAFRAME
     DatasetSizeCheck(),
     GroupbyColumnCheck(),