NVIDIA-NeMo
diff --git a/‎Makefile‎
Lines changed: 7 additions & 2 deletions b/‎Makefile‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pytest.ini‎
Lines changed: 1 addition & 0 deletions b/‎pytest.ini‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/conftest.py‎
Lines changed: 6 additions & 0 deletions b/‎tests/conftest.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎tests/smoke/README.md‎
Lines changed: 158 additions & 0 deletions b/‎tests/smoke/README.md‎
Lines changed: 158 additions & 0 deletions
diff --git a/‎tests/smoke/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎tests/smoke/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎tests/smoke/conftest.py‎
Lines changed: 165 additions & 0 deletions b/‎tests/smoke/conftest.py‎
Lines changed: 165 additions & 0 deletions
@@ -168,9 +168,14 @@ test-ci-slow: ## Run slow tests in CI with coverage
 	pushd $(NSS_ROOT_PATH) && \
 	$(PYTEST_CMD) $(PYTEST_CI_OPTS) $(NSS_ROOT_PATH)/tests -m "slow"
 
+.PHONY: test-smoke
+# The marker expression selects tests marked `smoke` and excludes those marked
+# `gpu_integration`. No test path is passed, so pytest's own collection
+# settings decide where to look.
+test-smoke: ## Run CPU smoke tests (~few min, no GPU required)
+	$(PYTEST_CMD) -m "smoke and not gpu_integration"
+
 .PHONY: test-gpu-integration
-test-gpu-integration: ## Run GPU integration tests
-	pushd $(NSS_ROOT_PATH) && \
+# Runs four separate pytest processes, chained with && so the first failure
+# stops the target. The Unsloth smoke test gets its own process, apart from the
+# other GPU smoke tests. All paths are anchored at $(NSS_ROOT_PATH) so the
+# target does not depend on the directory make is invoked from.
+test-gpu-integration: ## Run GPU integration tests (smoke GPU + e2e)
+	$(PYTEST_CMD) $(NSS_ROOT_PATH)/tests/smoke/ -m "gpu_integration" -k "not unsloth" && \
+	$(PYTEST_CMD) $(NSS_ROOT_PATH)/tests/smoke/ -m "gpu_integration" -k "unsloth" && \
 	$(PYTEST_CMD) $(NSS_ROOT_PATH)/tests/e2e/ -m "gpu_integration and not e2e" -k default && \
 	$(PYTEST_CMD) $(NSS_ROOT_PATH)/tests/e2e/ -m "gpu_integration and not e2e" -k dp
 
 
@@ -36,6 +36,8 @@ dependencies = [
   "structlog>=25.4.0",
   "colorama>=0.4.6",
   "tqdm>=4.67.1",
+  "setuptools>=80.0.0",
+  
 ]
 
 [dependency-groups]
 
@@ -22,6 +22,7 @@ markers =
     unit: Unit tests - test single classes/functions with no infrastructure dependencies
     unit_test: Legacy marker for unit tests (deprecated, use 'unit' instead)
     noautouse: Marker to skip autouse fixtures for specific tests
+    smoke: Smoke tests - slow unit tests exercising training/generation hot paths with tiny model
 
 # Note: Unit tests (testing single classes/functions with no infrastructure dependencies)
 # do not need markers and are the default test type.
 
@@ -23,6 +23,7 @@ def pytest_collection_modifyitems(config, items):
         "e2e",
         "integration",
         "gpu_integration",
+        "smoke",
     }
 
     for item in items:
@@ -45,6 +46,11 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.integration)
                 marker_names.add("integration")
 
+        if "/smoke/" in path_str:
+            if "smoke" not in marker_names:
+                item.add_marker(pytest.mark.smoke)
+                marker_names.add("smoke")
+
         if not marker_names.intersection(category_markers):
             item.add_marker(pytest.mark.unit)
 
 
@@ -0,0 +1,158 @@
+# Fast Smoke Tests for Training and Generation Hot Paths
+
+Smoke tests exercising training and generation hot paths with a tiny model.
+Run CPU tests without GPU (`make test-smoke`), GPU tests via `make test-gpu-integration`.
+
+## Shared Infrastructure (`conftest.py`)
+
+Key fixtures and helpers available to all smoke tests:
+
+- **`base_smoke_config`** -- `SafeSynthesizerParameters` with local tiny model defaults
+- **`train_with_sdk(config, data_df, save_path)`** -- runs `process_data().train()`, returns the `SafeSynthesizer` instance
+- **`assert_adapter_saved(workdir)`** -- asserts `adapter_config.json` + `*.safetensors` exist
+- **`_patch_attn_eager`** -- monkeypatches `attn_implementation` to `"eager"` for tiny model compatibility
+- **`tiny_model`** / **`stub_tokenizer`** / **`tiny_training_dataset`** -- CPU test primitives
+- **`local_tinyllama_dir`** -- local model directory for GPU tests (no internet needed)
+- **`iris_df`** / **`timeseries_df`** -- small stub datasets
+
+## Design Origin
+
+This test suite was organized into **self-contained work units (WUs)** that were delegated independently. WU1 and WU2 (infrastructure and fixtures) were done first. After that, WU3-WU12 were done in parallel, then consolidated in WU13.
+
+## Dependency Graph and Parallel Execution Strategy
+
+There are only two sequential dependencies: WU1 -> WU2 (foundation). After that, **all remaining WUs are fully independent** -- no test file reads output from another test file. Each GPU test does its own training internally.
+
+```mermaid
+flowchart TD
+    subgraph phase1 ["Phase 1 -- Foundation (sequential, do first)"]
+        WU1["WU1: Infrastructure"] --> WU2["WU2: Shared Fixtures"]
+    end
+
+    subgraph phase2 ["Phase 2 -- All parallelizable (no inter-dependencies)"]
+        direction TB
+        batchA["Batch A: WU0 README"]
+        batchB["Batch B: WU3 CPU Training + WU4 CPU Generation"]
+        batchC["Batch C: WU5 GPU Training + WU10 GPU Adapter Persistence"]
+        batchD["Batch D: WU6 GPU Generation + WU8 GPU Structured Gen"]
+        batchE["Batch E: WU7 GPU Timeseries"]
+        batchF["Batch F: WU9 GPU Resume"]
+        batchG["Batch G: WU11 GPU Full Pipeline + WU12 Unsloth"]
+    end
+
+    subgraph phase3 ["Phase 3 -- Consolidation (fresh agent, after all Phase 2)"]
+        WU13["WU13: DRY pass -- deduplicate, extract helpers, consolidate"]
+    end
+
+    WU2 --> batchA
+    WU2 --> batchB
+    WU2 --> batchC
+    WU2 --> batchD
+    WU2 --> batchE
+    WU2 --> batchF
+    WU2 --> batchG
+
+    batchA --> WU13
+    batchB --> WU13
+    batchC --> WU13
+    batchD --> WU13
+    batchE --> WU13
+    batchF --> WU13
+    batchG --> WU13
+```
+
+
+
+### Recommended Delegation Batches
+
+WU3-WU12 are grouped by **skill similarity** so each assignee has minimal context-switching:
+
+
+| Batch       | WUs             | Why grouped                                                       | Skills needed                                     | Size                      |
+| ----------- | --------------- | ----------------------------------------------------------------- | ------------------------------------------------- | ------------------------- |
+| **Phase 1** | WU0 + WU1 + WU2 | Sequential foundation; one person does all setup                  | pytest fixtures, basic infra                      | ~30 min                   |
+| **B**       | WU3 + WU4       | Both CPU-only, similar Trainer/generate patterns                  | HF Trainer, peft, Opacus, NSS assembler/processor | Medium (2 files, 7 tests) |
+| **C**       | WU5 + WU10      | Both train via SDK then inspect adapter output                    | SafeSynthesizer SDK, PEFT adapter loading         | Medium (2 files, 5 tests) |
+| **D**       | WU6 + WU8       | Both exercise vLLM generation paths                               | VllmBackend, vLLM, structured outputs             | Medium (2 files, 3 tests) |
+| **E**       | WU7             | Specialized timeseries knowledge                                  | TimeseriesBackend, timeseries config              | Small (1 file, 1 test)    |
+| **F**       | WU9             | Specialized resume/Workdir knowledge                              | SafeSynthesizer resume, load_from_save_path       | Small (1 file, 1 test)    |
+| **G**       | WU11 + WU12     | Both need internet + HF Hub; WU12 needs process isolation from DP | SmolLM2, Unsloth, HF Hub, Makefile update         | Medium (2 files, 3 tests) |
+
+
+### Phase 3: Consolidation (sequential, after all Phase 2 batches complete)
+
+
+| Batch | WU   | Purpose                                                                         | Owner                                              |
+| ----- | ---- | ------------------------------------------------------------------------------- | -------------------------------------------------- |
+| **H** | WU13 | Holistic DRY pass: deduplicate configs, extract helpers, consolidate decorators | **Fresh agent** (must NOT have worked on WU3-WU12) |
+
+
+### Priority order (if fewer hands available)
+
+If you cannot parallelize all batches, do them in this order (highest value first):
+
+1. **Phase 1** (A) -- must go first
+2. **C** (GPU SDK training + adapter) -- highest signal for catching regressions
+3. **D** (GPU generation + structured) -- tests the production generation path
+4. **B** (CPU training + generation) -- catches dep breakage without GPU
+5. **E** (timeseries) -- specialized but important path
+6. **F** (resume) -- important production flow
+7. **G** (SmolLM2 + Unsloth) -- lowest priority, needs internet; WU12 also needs Makefile update for process isolation
+8. **H** (consolidation) -- must be last; requires fresh eyes
+
+---
+
+## Critical Gotchas (every WU must know these)
+
+These were discovered by automated council review and affect ALL work units:
+
+1. **Copyright headers**: Every new `.py` file MUST start with:
+   ```python
+   # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+   # SPDX-License-Identifier: Apache-2.0
+   ```
+   Enforced by `tools/lint/copyright_fixer.py --check .` in CI.
+2. **`lora_r=8` not 4**: vLLM only allows LoRA ranks in {1, 8, 16, 32, 64, ...}. Rank 4 is silently rejected. Use 8 everywhere in smoke tests.
+3. **`holdout=0, max_holdout=0`**: The iris dataset has 151 rows, but `Holdout.train_test_split()` in `src/nemo_safe_synthesizer/holdout/holdout.py` requires >=200 rows. Setting holdout=0 bypasses this.
+4. **`attn_implementation="eager"`**: The `HuggingFaceBackend` defaults to `flash_attention_2` which can fail with head_dim=32 (our tiny model: hidden_size=64 / 2 heads). Override to `"eager"` in smoke tests.
+5. **`vocab_size=32000`**: The stub tokenizer at `tests/stub_tokenizer/` has 32000 tokens. The tiny model config must match this exactly.
+6. **`use_unsloth=False`**: Always set explicitly. The `auto` default may resolve to `True` and pull in Unsloth, which invasively patches transformers.
+7. **`optim="adamw_torch"`** for CPU tests: The production default `paged_adamw_32bit` requires bitsandbytes CUDA kernels.
+
+---
+
+## Summary: File Inventory
+
+
+| File                                              | WU   | Tests              | Marker                      |
+| ------------------------------------------------- | ---- | ------------------ | --------------------------- |
+| `tests/smoke/README.md`                           | WU0  | -- (documentation) | --                          |
+| `tests/smoke/__init__.py`                         | WU1  | --                 | --                          |
+| `tests/smoke/conftest.py`                         | WU2  | -- (fixtures only) | --                          |
+| `tests/smoke/test_training_cpu.py`                | WU3  | 4                  | `smoke` (auto)              |
+| `tests/smoke/test_generation_cpu.py`              | WU4  | 3                  | `smoke` (auto)              |
+| `tests/smoke/test_nss_training_gpu.py`            | WU5  | 2                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_nss_generation_gpu.py`          | WU6  | 2                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_nss_timeseries_gpu.py`          | WU7  | 1                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_nss_structured_gen_gpu.py`      | WU8  | 1                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_nss_resume_gpu.py`              | WU9  | 1                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_nss_adapter_persistence_gpu.py` | WU10 | 3                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_full_pipeline_gpu.py`           | WU11 | 2                  | `smoke` + `gpu_integration` |
+| `tests/smoke/test_nss_unsloth_gpu.py`             | WU12 | 1                  | `smoke` + `gpu_integration` |
+
+
+**Modified files**: `tests/conftest.py`, `pytest.ini`, `pyproject.toml`, `Makefile`
+
+**Total**: 20 tests across 10 test files, plus 2 infra files (conftest, init) and 1 README.
+
+## Running
+
+```bash
+# CPU smoke tests only (~10 seconds, no GPU required)
+make test-smoke
+
+# GPU smoke + e2e tests (requires CUDA)
+make test-gpu-integration
+```
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,165 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from datasets import Dataset
+from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
+
+
+@pytest.fixture(scope="session")
+def fixture_stub_tokenizer_path() -> str:
+    """Session-scoped override of the function-scoped fixture in tests/conftest.py."""
+    return str(Path(__file__).parent.parent / "stub_tokenizer")
+
+
+@pytest.fixture(scope="session")
+def tiny_llama_config(fixture_stub_tokenizer_path):
+    """LlamaConfig with minimal dimensions for fast smoke testing."""
+    tokenizer = AutoTokenizer.from_pretrained(fixture_stub_tokenizer_path)
+    return LlamaConfig(
+        vocab_size=tokenizer.vocab_size,  # 32000 -- must match stub tokenizer
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=2,
+        num_key_value_heads=2,
+        max_position_embeddings=128,
+    )
+
+
+@pytest.fixture
+def tiny_model(tiny_llama_config):
+    """Return a new ``LlamaForCausalLM`` constructed from ``tiny_llama_config``.
+
+    The model is built directly from the config (randomly initialized, no
+    download). The fixture is function-scoped, so every test receives its own
+    instance.
+
+    NOTE(review): an earlier version of this docstring estimated the model at
+    "~few KB". With a 32000-token vocabulary and hidden_size=64 the embedding
+    table alone is ~2M parameters, so the real footprint is likely in the
+    megabyte range -- confirm.
+    """
+    return LlamaForCausalLM(tiny_llama_config)
+
+
+@pytest.fixture(scope="session")
+def stub_tokenizer(fixture_stub_tokenizer_path):
+    """Load the Llama stub tokenizer from tests/stub_tokenizer/."""
+    return AutoTokenizer.from_pretrained(fixture_stub_tokenizer_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_training_dataset(stub_tokenizer):
+    """~8 tokenized training examples as a datasets.Dataset."""
+    texts = [
+        '{"col1":"a","col2":"1"}',
+        '{"col1":"b","col2":"2"}',
+        '{"col1":"c","col2":"3"}',
+        '{"col1":"d","col2":"4"}',
+        '{"col1":"e","col2":"5"}',
+        '{"col1":"f","col2":"6"}',
+        '{"col1":"g","col2":"7"}',
+        '{"col1":"h","col2":"8"}',
+    ]
+    tokenized = stub_tokenizer(texts, padding="max_length", truncation=True, max_length=64, return_tensors="np")
+    return Dataset.from_dict(
+        {
+            "input_ids": tokenized["input_ids"].tolist(),
+            "attention_mask": tokenized["attention_mask"].tolist(),
+            "labels": tokenized["input_ids"].tolist(),  # labels = input_ids for causal LM
+        }
+    )
+
+
+@pytest.fixture(scope="session")
+def tiny_training_dataset_with_position_ids(tiny_training_dataset):
+    """Training dataset with position_ids column, required by DataCollatorForPrivateTokenClassification."""
+    seq_len = len(tiny_training_dataset[0]["input_ids"])
+    position_ids = [list(range(seq_len))] * len(tiny_training_dataset)
+    return tiny_training_dataset.add_column("position_ids", position_ids)
+
+
+@pytest.fixture(scope="session")
+def local_tinyllama_dir(tmp_path_factory, tiny_llama_config, stub_tokenizer):
+    """Save tiny model + tokenizer to a local dir named with 'tinyllama' for NSS compatibility."""
+    local_dir = tmp_path_factory.mktemp("smoke-tinyllama-model")
+    model = LlamaForCausalLM(tiny_llama_config)
+    model.save_pretrained(local_dir)
+    stub_tokenizer.save_pretrained(local_dir)
+    return local_dir
+
+
+@pytest.fixture(scope="session")
+def iris_df():
+    """Load iris.csv from stub_datasets."""
+    return pd.read_csv(Path(__file__).parent.parent / "stub_datasets" / "iris.csv")
+
+
+@pytest.fixture(scope="session")
+def timeseries_df():
+    """Minimal timeseries stub: 2 groups, 5 rows each, elapsed_seconds."""
+    return pd.DataFrame(
+        {
+            "group_id": ["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"],
+            "elapsed_seconds": [0, 60, 120, 180, 240, 0, 60, 120, 180, 240],
+            "value": [10, 20, 30, 40, 50, 100, 110, 120, 130, 140],
+        }
+    )
+
+
+@pytest.fixture(scope="session")
+def smoke_save_path(tmp_path_factory):
+    """Shared temp directory for Tier B (SmolLM2) train -> generate flow."""
+    return tmp_path_factory.mktemp("smoke-tier-b")
+
+
+@pytest.fixture
+def base_smoke_config(local_tinyllama_dir):
+    """Base SafeSynthesizerParameters shared by all GPU smoke tests with local tiny model.
+
+    Individual tests override specific fields via SafeSynthesizerParameters.from_params(**overrides).
+    """
+    from nemo_safe_synthesizer.config.parameters import SafeSynthesizerParameters
+
+    return SafeSynthesizerParameters.from_params(
+        enable_synthesis=True,
+        enable_replace_pii=False,
+        pretrained_model=str(local_tinyllama_dir),
+        use_unsloth=False,
+        num_input_records_to_sample=10,
+        num_records=5,
+        lora_r=8,
+        holdout=0,
+        max_holdout=0,
+    )
+
+
+def assert_adapter_saved(workdir):
+    """Verify adapter files exist after training.
+
+    Reusable assertion helper for any test that trains via the SDK.
+    """
+    adapter_dir = workdir.train.adapter.path
+    assert (adapter_dir / "adapter_config.json").exists(), "adapter_config.json missing"
+    assert any(adapter_dir.glob("*.safetensors")), "No safetensors files found"
+
+
+def train_with_sdk(config, data_df, save_path):
+    """Run SafeSynthesizer.process_data().train() and return the instance."""
+    from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer
+
+    nss = SafeSynthesizer(config=config, save_path=save_path)
+    nss.with_data_source(data_df).process_data().train()
+    return nss
+
+
+@pytest.fixture
+def _patch_attn_eager(monkeypatch):
+    """Make ``HuggingFaceBackend`` fall back to ``attn_implementation="eager"``.
+
+    Wraps ``HuggingFaceBackend._build_base_framework_params`` so that
+    ``"attn_implementation"`` is set to ``"eager"`` in ``model_kwargs`` before
+    the original method runs. ``setdefault`` is used, so a value already
+    present in ``model_kwargs`` is left untouched. The patch is installed via
+    ``monkeypatch`` and is therefore undone automatically after each test.
+
+    The backend's default attention implementation can fail with head_dim=32
+    (our tiny model: hidden_size=64 / 2 heads).
+    NOTE(review): this docstring previously named that default as
+    'flashinfer', while the smoke-test README names it `flash_attention_2` --
+    confirm which is correct.
+    """
+    from nemo_safe_synthesizer.training.huggingface_backend import HuggingFaceBackend
+
+    # Keep a handle on the unpatched method so the wrapper can delegate to it.
+    original = HuggingFaceBackend._build_base_framework_params
+
+    def patched(self, model_kwargs):
+        # Only fills in the key when the caller has not supplied one.
+        model_kwargs.setdefault("attn_implementation", "eager")
+        return original(self, model_kwargs)
+
+    monkeypatch.setattr(HuggingFaceBackend, "_build_base_framework_params", patched)
Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,8 @@ dependencies = [`
`36`	`36`	`"structlog>=25.4.0",`
`37`	`37`	`"colorama>=0.4.6",`
`38`	`38`	`"tqdm>=4.67.1",`
	`39`	`+ "setuptools>=80.0.0",`
	`40`	`+`
`39`	`41`	`]`
`40`	`42`
`41`	`43`	`[dependency-groups]`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
	`2`	`+# SPDX-License-Identifier: Apache-2.0`