Commit 6fd711e
fix: smoke test quality improvements and dead parameter cleanup
- Replace dead `enable_replace_pii=False` with `replace_pii=None` in all `from_params()` calls (the old key was silently ignored by Pydantic)
- Remove dead `enable_synthesis=True` (no such field exists in src/)
- Add `vllm` pytest marker for GPU memory isolation; refactor Makefile `test-smoke-gpu` into marker-based groups (train-only / vllm / smollm2 / unsloth)
- Tighten `except GenerationError` to assert on the expected message
- Fix SmolLM2 metadata: remove hardcoded out-of-range BOS token IDs
- Promote `base_smoke_config` and `_patch_attn_eager` to session scope
- Make `tiny_llama_config` depend on `stub_tokenizer` (eliminates a redundant tokenizer load)
- Add CPU smoke tests for evaluation (MultimodalReport) and PII replacement (NemoPII)
- Update smoke test docs (README.md, TESTING.md) to reflect the new markers and scopes

Made-with: Cursor
1 parent 9137c3e

17 files changed

Lines changed: 250 additions & 177 deletions
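The first bullet describes a classic Pydantic trap: a model that ignores unknown keys accepts a misspelled or stale parameter name without complaint, so the caller's intent is silently dropped. A minimal sketch of the failure mode, assuming `from_params()` feeds kwargs into a Pydantic model that ignores extras (names below are illustrative, not this project's real API):

```python
# Sketch of the "dead parameter" bug class. Illustrative names only.
from pydantic import BaseModel, ConfigDict


class Params(BaseModel):
    model_config = ConfigDict(extra="ignore")  # unknown keys vanish silently

    replace_pii: dict | None = None  # the field the model actually declares


# A stale key is accepted without error and has no effect:
p = Params(enable_replace_pii=False)  # silently ignored, no exception
assert p.replace_pii is None  # value came from the default, not the caller

# With extra="forbid" the same call would raise a ValidationError,
# which is how this class of dead parameter gets caught early.
```

The fix in this commit is the second half of that sketch: pass the field Pydantic actually declares (`replace_pii=None`) instead of a key it will discard.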

Makefile

Lines changed: 20 additions & 5 deletions

```diff
@@ -182,13 +182,28 @@ test-ci-slow: ## Run slow tests in CI with coverage
 test-smoke: ## Run CPU smoke tests (~few min, no GPU required)
 	$(PYTEST_CMD) -m "smoke and not requires_gpu"
 
+SMOKE_DIR := tests/smoke
 .PHONY: test-smoke-gpu
 test-smoke-gpu: ## Run GPU smoke tests (requires CUDA)
-	# -n 0 disables xdist: CUDA device-side asserts poison the worker, cascading to all subsequent tests.
-	# Separate invocations: (1) local tiny-model tests, (2) SmolLM2 Hub test, (3) Unsloth (process-isolated from DP).
-	$(PYTEST_CMD) tests/smoke/ -n 0 -m "requires_gpu and not unsloth and not smollm2"
-	$(PYTEST_CMD) tests/smoke/ -n 0 -m "requires_gpu and smollm2"
-	$(PYTEST_CMD) tests/smoke/ -n 0 -m "requires_gpu and unsloth"
+	# -n 0 disables xdist. Groups are split for GPU memory isolation.
+	#
+	# When adding a new GPU smoke test file:
+	#   - Train-only (no vLLM): add pytest.mark.requires_gpu -> auto-discovered below
+	#   - Uses vLLM: also add pytest.mark.vllm -> add the file to the vLLM list below
+	#   - Uses Unsloth: also add pytest.mark.unsloth -> auto-discovered below
+	#   - Downloads from Hub: also add pytest.mark.smollm2 (or similar) -> auto-discovered below
+	#
+	# 1) Train-only tests share a process (no vLLM, safe to batch).
+	$(PYTEST_CMD) $(SMOKE_DIR)/ -n 0 -m "requires_gpu and not vllm and not smollm2 and not unsloth"
+	# 2) Each vLLM test file gets its own process -- vLLM pre-allocates all GPU
+	#    memory and never releases it within a process.
+	$(PYTEST_CMD) $(SMOKE_DIR)/test_nss_generation_gpu.py -n 0
+	$(PYTEST_CMD) $(SMOKE_DIR)/test_nss_resume_gpu.py -n 0
+	$(PYTEST_CMD) $(SMOKE_DIR)/test_nss_structured_gen_gpu.py -n 0
+	$(PYTEST_CMD) $(SMOKE_DIR)/test_nss_timeseries_gpu.py -n 0
+	# 3) SmolLM2 (Hub download + vLLM) and Unsloth (patches transformers) are marker-isolated.
+	$(PYTEST_CMD) $(SMOKE_DIR)/ -n 0 -m "requires_gpu and smollm2"
+	$(PYTEST_CMD) $(SMOKE_DIR)/ -n 0 -m "requires_gpu and unsloth"
 
 
 E2E_TEST_FILE := $(NSS_ROOT_PATH)/tests/e2e/test_safe_synthesizer.py
```
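Following the comment block in the target above, a new vLLM-backed smoke test file would look roughly like this. This is a sketch with a hypothetical file and test name; the `pytestmark` recipe itself comes from `tests/smoke/README.md` below:

```python
# Hypothetical tests/smoke/test_nss_newpath_gpu.py -- sketch of the marker
# recipe the Makefile comment describes. The test body is illustrative only.
import sys

import pytest
import torch

pytestmark = [
    pytest.mark.requires_gpu,
    pytest.mark.vllm,  # calls .generate() -> needs a process to itself
    pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available"),
    pytest.mark.skipif(sys.platform == "darwin", reason="Not applicable on macOS"),
]


def test_newpath_generates(base_smoke_config, iris_df):
    ...  # train on iris_df, then .generate() -- exercises the vLLM path
```

Because of the `vllm` marker, group 1's `not vllm` filter excludes the file automatically; it only actually runs once its path is added to the explicit per-file list in group 2.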

pytest.ini

Lines changed: 1 addition & 0 deletions

```diff
@@ -18,6 +18,7 @@ markers =
     smoke: Smoke tests - quick tests exercising training/generation hot paths with tiny models
     e2e: End-to-end tests - test the entire pipeline from data to generation to evaluation
     requires_gpu: Test needs CUDA hardware (orthogonal modifier, stacks on smoke/e2e)
+    vllm: Tests using vLLM generation backend (each file runs in its own process for GPU memory isolation)
     smollm2: SmolLM2 Hub download tests (used by Makefile for process isolation)
     unsloth: Unsloth backend tests (process-isolated from DP tests)
     noautouse: Marker to skip autouse fixtures for specific tests
```
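Since `--strict-markers` is enabled (see the TESTING.md excerpt below), registering `vllm` here is what allows tests to carry it at all: an unregistered marker fails at collection rather than silently selecting nothing. A small sketch with a hypothetical test:

```python
# With --strict-markers, any marker absent from pytest.ini's `markers`
# list is a collection error, not a silent no-op.
import pytest


@pytest.mark.vlm  # typo of "vllm" -> pytest refuses to collect this file
def test_never_runs():
    ...
```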

src/nemo_safe_synthesizer/llm/metadata.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -767,8 +767,6 @@ def __init__(
                 add_bos_token_to_prompt=False,
                 add_eos_token_to_prompt=False,
                 tokenizer=tokenizer,
-                bos_token="<|im_start|>",
-                bos_token_id=151644,
                 name=model_name_or_path,
             ),
             model_name_or_path=model_name_or_path,
```
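This is the "out-of-range BOS token" fix from the commit message: `151644` is Qwen2's `<|im_start|>` ID, but SmolLM2's vocabulary holds roughly 49K tokens, so an embedding lookup with that ID dies in a CUDA device-side assert instead of a readable error. A sketch of the kind of bounds check that surfaces this on CPU (hypothetical helper, not project code):

```python
# Hypothetical guard: a hardcoded special-token ID must be a valid index
# into the embedding table, or GPU runs fail with an opaque device-side assert.
def check_token_id(token_id: int, vocab_size: int) -> None:
    if not 0 <= token_id < vocab_size:
        raise ValueError(f"token id {token_id} out of range for vocab size {vocab_size}")


try:
    check_token_id(151644, 49152)  # Qwen2 <|im_start|> vs. SmolLM2's ~49K vocab
except ValueError as err:
    print(err)  # caught on CPU, long before any CUDA kernel runs
```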

tests/TESTING.md

Lines changed: 17 additions & 4 deletions

```diff
@@ -73,6 +73,9 @@ Defined in `pytest.ini` (`--strict-markers` is enabled):
 | `smoke` | Quick smoke tests (training/generation hot paths, tiny models) |
 | `e2e` | End-to-end pipeline tests (requires CUDA) |
 | `requires_gpu` | Test needs CUDA hardware (modifier, stacks on `smoke`/`e2e`) |
+| `vllm` | Tests using vLLM generation backend (each file runs in its own process for GPU memory isolation) |
+| `smollm2` | SmolLM2 Hub download tests (Makefile uses for process isolation) |
+| `unsloth` | Unsloth backend tests (process-isolated from DP tests) |
 | `noautouse` | Skip autouse fixtures for specific tests |
 
 ## Auto-marking
@@ -117,7 +120,7 @@ Per-module fixtures:
 - Generation/eval/data_processing: shared tokenizer and JSONL fixtures
 - CLI: `mock_workdir(tmp_path)` for tmp_path-based Workdir
 - Config: `basic_parameter`, `training_hyperparams`, `simple_safe_synthesizer_parameters`
-- Smoke: session-scoped `tiny_llama_config`, `stub_tokenizer`, `local_tinyllama_dir`, `iris_df`; function-scoped `base_smoke_config`, `tiny_model`, `_patch_attn_eager`; helpers `train_with_sdk()`, `assert_adapter_saved()`
+- Smoke: session-scoped `tiny_llama_config`, `stub_tokenizer`, `local_tinyllama_dir`, `iris_df`, `base_smoke_config`, `_patch_attn_eager`; function-scoped `tiny_model`; helpers `train_with_sdk()`, `assert_adapter_saved()`
 
 ## Fixture Scoping
 
@@ -133,10 +136,20 @@ Mock Workdir via `mock_workdir(tmp_path)` in `cli/conftest.py`.
 
 ## GPU Isolation Gotcha
 
-Unsloth patches transformers at import time, which poisons Opacus/DP if they share a process. CUDA device-side asserts also cascade across xdist workers. Both e2e and smoke GPU tests require process isolation:
+Two GPU isolation hazards require per-file process isolation (`-n 0`):
 
-- `make test-smoke-gpu` runs three separate single-process (`-n 0`) pytest invocations over `tests/smoke/`, split by `-k` filters: (1) non-unsloth/non-smollm2, (2) smollm2, (3) unsloth.
-- `make test-e2e` splits into `test-e2e-default` + `test-e2e-dp`, each single-process over `tests/e2e/`.
+1. vLLM pre-allocates all GPU memory and never releases it within a process. Tests that call `.generate()` must run in separate processes or later tests OOM.
+2. Unsloth patches transformers at import time, poisoning Opacus/DP if they share a process.
+
+GPU smoke tests use markers to express isolation requirements:
+
+- `requires_gpu`: all GPU tests
+- `vllm`: tests using vLLM generation (each file gets its own process)
+- `smollm2`, `unsloth`: marker-isolated groups (auto-discovered)
+
+`make test-smoke-gpu` uses marker algebra for train-only tests (auto-discovering via `requires_gpu and not vllm and not smollm2 and not unsloth`), explicit file paths for vLLM tests (per-file isolation), and marker selection for SmolLM2/Unsloth. When adding a new vLLM test file, add `pytest.mark.vllm` and also add the file to the Makefile's explicit list.
+
+`make test-e2e` splits into `test-e2e-default` + `test-e2e-dp`, each single-process over `tests/e2e/`.
 
 See [`tests/smoke/README.md`](smoke/README.md) for additional smoke-specific gotchas.
```
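Spelled out, the `test-smoke-gpu` recipe is just a loop over isolation groups. A rough Python rendering for clarity (sketch only; the Makefile target above is authoritative, and `$(PYTEST_CMD)` may carry extra flags):

```python
# Rough Python equivalent of the Makefile's test-smoke-gpu grouping.
import subprocess

VLLM_FILES = [
    "tests/smoke/test_nss_generation_gpu.py",
    "tests/smoke/test_nss_resume_gpu.py",
    "tests/smoke/test_nss_structured_gen_gpu.py",
    "tests/smoke/test_nss_timeseries_gpu.py",
]


def run(*args: str) -> None:
    # -n 0 disables xdist so each invocation is a single process.
    subprocess.run(["pytest", "-n", "0", *args], check=True)


# 1) Train-only tests batch safely into one process.
run("tests/smoke/", "-m", "requires_gpu and not vllm and not smollm2 and not unsloth")
# 2) One fresh process per vLLM file: vLLM's GPU memory is never returned.
for path in VLLM_FILES:
    run(path)
# 3) Marker-isolated groups.
run("tests/smoke/", "-m", "requires_gpu and smollm2")
run("tests/smoke/", "-m", "requires_gpu and unsloth")
```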
tests/smoke/README.md

Lines changed: 26 additions & 24 deletions

````diff
@@ -1,6 +1,6 @@
 # Smoke Tests
 
-Quick tests that verify training and generation code paths don't crash.
+Quick tests that verify training, generation, evaluation, and PII replacement code paths don't crash.
 They use tiny or small models and run in seconds (CPU) or a few minutes (GPU).
 
 ```bash
@@ -10,9 +10,10 @@ make test-smoke-gpu # GPU tests (requires CUDA)
 
 ## When should I add a smoke test?
 
-If you're adding a new training backend, generation backend, or model family,
-add a smoke test for it. Same if you're changing how the SDK orchestrates
-train/generate -- those paths are easy to break silently.
+If you're adding a new training backend, generation backend, evaluation
+component, or model family, add a smoke test for it. Same if you're changing
+how the SDK orchestrates train/generate/evaluate -- those paths are easy to
+break silently.
 
 Smoke tests don't check output quality. They just make sure the code runs
 end-to-end without throwing. Use the smallest model that exercises the path
@@ -21,31 +22,24 @@ a real tokenizer/model).
 
 ## GPU Test Process Isolation
 
-GPU smoke tests run in three separate single-process (`-n 0`) pytest invocations to avoid CUDA and import-time conflicts:
+GPU smoke tests use three marker-based isolation groups:
 
-1. Local tiny-model tests (everything except SmolLM2 and Unsloth)
-2. SmolLM2 Hub download test (downloads ~270MB from HuggingFace)
-3. Unsloth backend test (process-isolated from DP tests)
+1. Train-only (`requires_gpu` without `vllm`/`smollm2`/`unsloth`): share a single process, auto-discovered via marker algebra.
+2. vLLM generation (`vllm` marker): each file gets its own process because vLLM pre-allocates all GPU memory and never releases it.
+3. SmolLM2 / Unsloth (`smollm2`, `unsloth` markers): each gets its own process, auto-discovered via markers.
 
-Why: Unsloth monkey-patches transformers at import time, poisoning Opacus/DP if they share a process. CUDA device-side asserts also cascade across xdist workers. The Makefile `test-smoke-gpu` target handles the split automatically via `-k` filters.
-
-Tests use pytestmark decorators:
+When adding a new GPU smoke test, add the appropriate markers to `pytestmark`:
 
 ```python
 pytestmark = [
     pytest.mark.requires_gpu,
+    pytest.mark.vllm,  # if the test calls .generate() (uses vLLM)
     pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available"),
     pytest.mark.skipif(sys.platform == "darwin", reason="Not applicable on macOS"),
 ]
 ```
 
-For SmolLM2 and Unsloth tests, add the marker to a test function:
-
-```python
-@pytest.mark.usefixtures("_register_smollm2")  # for SmolLM2 tests
-def test_full_pipeline_smollm2(...):
-    ...
-```
+If the new file uses vLLM, also add it to the explicit file list in the `test-smoke-gpu` Makefile target (vLLM files need per-file isolation).
 
 ## Things that will bite you
 
@@ -58,14 +52,22 @@ def test_full_pipeline_smollm2(...):
 
 ## What's in `conftest.py`?
 
-The shared fixtures cover both CPU and GPU smoke tests. The most important ones:
+The shared fixtures cover both CPU and GPU smoke tests. Session-scoped fixtures are created once per pytest process; function-scoped fixtures are recreated per test.
 
-- `base_smoke_config` -- default `SafeSynthesizerParameters` pointing at the local tiny model
-- `train_with_sdk(config, data_df, save_path)` -- convenience wrapper around the SDK train flow
-- `assert_adapter_saved(workdir)` -- checks that adapter files landed on disk
+Session-scoped (immutable / read-only):
+
+- `base_smoke_config` -- default `SafeSynthesizerParameters` pointing at the local tiny model (Pydantic frozen model)
 - `_patch_attn_eager` -- the attention implementation workaround mentioned above
-- `tiny_model`, `stub_tokenizer`, `tiny_training_dataset` -- CPU test building blocks
-- `local_tinyllama_dir` -- saves the tiny model to a temp dir so GPU tests don't need internet
+- `stub_tokenizer`, `tiny_llama_config`, `local_tinyllama_dir` -- tokenizer and tiny model on disk
 - `iris_df`, `timeseries_df` -- small DataFrames for training input
 
+Function-scoped (fresh per test):
+
+- `tiny_model` -- randomly initialized `LlamaForCausalLM` (mutated by training)
+
+Helpers (plain functions, not fixtures):
+
+- `train_with_sdk(config, data_df, save_path)` -- convenience wrapper around the SDK train flow
+- `assert_adapter_saved(workdir)` -- checks that adapter files landed on disk
+
 See [CONTRIBUTING.md](../../CONTRIBUTING.md#testing) for the full list of test commands.
````
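The session/function split in the rewritten fixture list follows the standard pytest rule: share what no test mutates, rebuild what tests mutate. A generic sketch of the two scopes (hypothetical fixtures, not this conftest's):

```python
# Generic illustration of the scoping rule (hypothetical fixtures).
import pytest


@pytest.fixture(scope="session")
def shared_readonly_config():
    # Built once per pytest process. Safe to share only because
    # nothing mutates it after creation.
    return {"pretrained_model": "/tmp/tiny-llama", "use_unsloth": False}


@pytest.fixture  # function scope is the default
def mutable_state(shared_readonly_config):
    # Rebuilt for every test: tests mutate this, and a shared copy
    # would leak one test's updates into the next.
    return {"weights": [0.0, 0.0], **shared_readonly_config}
```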

tests/smoke/conftest.py

Lines changed: 17 additions & 62 deletions

```diff
@@ -6,72 +6,26 @@
 import pandas as pd
 import pytest
 from datasets import Dataset
-from transformers import AutoConfig, AutoTokenizer, LlamaConfig, LlamaForCausalLM
+from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
 
 from nemo_safe_synthesizer.cli.artifact_structure import Workdir
 from nemo_safe_synthesizer.config.parameters import SafeSynthesizerParameters
-from nemo_safe_synthesizer.defaults import DEFAULT_INSTRUCTION
-from nemo_safe_synthesizer.llm.metadata import LLMPromptConfig, ModelMetadata
 from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer
 
-
-class SmolLM2(ModelMetadata):
-    """Test-only metadata for HuggingFaceTB/SmolLM2-135M.
-
-    Moved out of production code because SmolLM2-135M is only used in smoke tests.
-    Uses the tokenizer's native BOS token (not the Qwen2 <|im_start|> override
-    that was previously hardcoded -- that token ID is out of range for SmolLM2's
-    49K vocab and causes CUDA device-side asserts).
-    """
-
-    def __init__(
-        self, model_name_or_path: str, tokenizer=None, rope_scaling_factor: float | None = None, **kwargs
-    ) -> None:
-        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
-        config = AutoConfig.from_pretrained(model_name_or_path)
-
-        super().__init__(
-            autoconfig=config,
-            instruction=DEFAULT_INSTRUCTION,
-            prompt_config=LLMPromptConfig.from_tokenizer(
-                template="user\n {instruction} {schema} \n assistant\n{prefill}",
-                add_bos_token_to_prompt=False,
-                add_eos_token_to_prompt=False,
-                tokenizer=tokenizer,
-                name=model_name_or_path,
-            ),
-            model_name_or_path=model_name_or_path,
-            rope_scaling=None,
-            rope_parameters_location="autoconfig",
-            **kwargs,
-        )
-
-
-@pytest.fixture
-def _register_smollm2(monkeypatch):
-    """Patch ModelMetadata resolution so the SDK can find SmolLM2 (test-only class)."""
-    original = ModelMetadata.from_str_or_path.__func__
-
-    def patched(cls, model_name_or_path, **kwargs):
-        if "smollm2" in str(model_name_or_path).lower():
-            return SmolLM2(model_name_or_path=str(model_name_or_path), **kwargs)
-        return original(cls, model_name_or_path, **kwargs)
-
-    monkeypatch.setattr(ModelMetadata, "from_str_or_path", classmethod(patched))
+STUB_DATASETS_DIR = Path(__file__).parent.parent / "stub_datasets"
 
 
 @pytest.fixture(scope="session")
 def fixture_stub_tokenizer_path() -> str:
-    """Session-scoped override of the function-scoped fixture in tests/conftest.py."""
+    """Path to the Llama stub tokenizer in tests/stub_tokenizer/."""
     return str(Path(__file__).parent.parent / "stub_tokenizer")
 
 
 @pytest.fixture(scope="session")
-def tiny_llama_config(fixture_stub_tokenizer_path):
+def tiny_llama_config(stub_tokenizer):
     """LlamaConfig with minimal dimensions for fast smoke testing."""
-    tokenizer = AutoTokenizer.from_pretrained(fixture_stub_tokenizer_path)
     return LlamaConfig(
-        vocab_size=tokenizer.vocab_size,  # 32000 -- must match stub tokenizer
+        vocab_size=stub_tokenizer.vocab_size,  # 32000 -- must match stub tokenizer
         hidden_size=64,
         intermediate_size=128,
         num_hidden_layers=2,
@@ -137,9 +91,7 @@ def local_tinyllama_dir(tmp_path_factory, tiny_llama_config, stub_tokenizer):
 @pytest.fixture(scope="session")
 def iris_df():
     """Load iris.csv from stub_datasets."""
-    from tests.conftest import load_test_dataframe
-
-    return load_test_dataframe("iris.csv").copy()
+    return pd.read_csv(STUB_DATASETS_DIR / "iris.csv")
 
 
 @pytest.fixture(scope="session")
@@ -162,7 +114,7 @@ def timeseries_df():
             ],
             "value": [10, 20, 30, 40, 50, 100, 110, 120, 130, 140],
         }
-    ).copy()
+    )
 
 
 @pytest.fixture(scope="session")
@@ -171,15 +123,15 @@ def smoke_save_path(tmp_path_factory):
     return tmp_path_factory.mktemp("smoke-tier-b")
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def base_smoke_config(local_tinyllama_dir):
     """Base SafeSynthesizerParameters shared by all GPU smoke tests with local tiny model.
 
-    Individual tests override specific fields via SafeSynthesizerParameters.from_params(**overrides).
+    Session-scoped because the config is immutable (Pydantic frozen model).
+    Tests that need different settings create their own via SafeSynthesizerParameters.from_params().
     """
     return SafeSynthesizerParameters.from_params(
-        enable_synthesis=True,
-        enable_replace_pii=False,
+        replace_pii=None,
         pretrained_model=str(local_tinyllama_dir),
         use_unsloth=False,
         num_input_records_to_sample=10,
@@ -207,10 +159,11 @@ def train_with_sdk(config: SafeSynthesizerParameters, data_df: pd.DataFrame, sav
     return nss
 
 
-@pytest.fixture
-def _patch_attn_eager(monkeypatch):
+@pytest.fixture(scope="session")
+def _patch_attn_eager():
     """Override attn_implementation from 'flashinfer' (not a valid HF option) to 'sdpa'.
 
+    Session-scoped so class-scoped and function-scoped fixtures can depend on it.
     The HuggingFaceBackend defaults to 'flashinfer' which is not supported by
    HuggingFace's from_pretrained. PyTorch SDPA is universally compatible.
     """
@@ -222,4 +175,6 @@ def patched_build(self, model_kwargs):
         model_kwargs.setdefault("attn_implementation", "sdpa")
         return original_build(self, model_kwargs)
 
-    monkeypatch.setattr(HuggingFaceBackend, "_build_base_framework_params", patched_build)
+    HuggingFaceBackend._build_base_framework_params = patched_build
+    yield
+    HuggingFaceBackend._build_base_framework_params = original_build
```
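Two details in this refactor are worth unpacking. First, session scope for `base_smoke_config` is safe because Pydantic frozen models reject mutation outright; a minimal sketch (generic model, not `SafeSynthesizerParameters` itself):

```python
# Why a frozen Pydantic model can be shared at session scope: attribute
# assignment raises, so no test can mutate the shared instance.
from pydantic import BaseModel, ConfigDict, ValidationError


class FrozenConfig(BaseModel):
    model_config = ConfigDict(frozen=True)

    pretrained_model: str
    use_unsloth: bool = False


cfg = FrozenConfig(pretrained_model="/tmp/tiny-llama")
try:
    cfg.use_unsloth = True  # any mutation attempt...
except ValidationError:
    pass  # ...fails with a frozen_instance error

# Tests needing different values build a new object instead of mutating:
cfg2 = cfg.model_copy(update={"use_unsloth": True})
assert cfg.use_unsloth is False and cfg2.use_unsloth is True
```

Second, `_patch_attn_eager` drops `monkeypatch` because pytest's `monkeypatch` fixture is function-scoped and cannot be requested from a session-scoped fixture; the manual assign/`yield`/restore in the diff is the standard workaround.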

tests/smoke/test_evaluation_cpu.py

Lines changed: 39 additions & 0 deletions

```diff
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""CPU evaluation smoke test -- MultimodalReport.from_dataframes().
+
+Exercises the evaluation pipeline (column distribution, correlation,
+deep structure, text similarity) on a small dataset with privacy
+metrics disabled. Catches dep breakage in the evaluation stack
+(scipy, plotly, sentence-transformers, etc.).
+"""
+
+import pytest
+
+pytest.importorskip(
+    "sentence_transformers", reason="sentence-transformers required (install with: uv sync --extra cpu)"
+)
+
+from nemo_safe_synthesizer.config.parameters import EvaluationParameters, SafeSynthesizerParameters
+from nemo_safe_synthesizer.evaluation.reports.multimodal.multimodal_report import MultimodalReport
+
+
+def test_multimodal_report_from_dataframes(iris_df):
+    """Build a MultimodalReport from iris_df on CPU with privacy metrics off."""
+    config = SafeSynthesizerParameters(
+        evaluation=EvaluationParameters(mia_enabled=False, aia_enabled=False),
+    )
+    reference = iris_df.copy()
+    output = iris_df.sample(frac=0.8, random_state=42).reset_index(drop=True)
+
+    report = MultimodalReport.from_dataframes(
+        reference=reference,
+        output=output,
+        config=config,
+    )
+    assert report is not None
+    assert len(report.components) > 0
+
+    score_dict = report.get_dict()
+    assert "Synthetic Quality Score" in score_dict
```