chore: feedback -- add smoke-test CI job, simplify smoke README, drop SmolLM2 metadata

binaryaaron · binaryaaron · commit 0280ed2ed6ec · 2026-02-20T22:14:19.000Z
Signed-off-by: aagonzales <aagonzales@nvidia.com>
diff --git a/.github/workflows/ci-checks.yml b/.github/workflows/ci-checks.yml
@@ -165,6 +165,27 @@ jobs:
           token: ${{ secrets.CODECOV_TOKEN }}
           fail_ci_if_error: false
 
+  smoke-test:
+    name: Smoke Tests
+    needs: changes
+    if: ${{ needs.changes.outputs.src == 'true' || needs.changes.outputs.test == 'true' || github.event_name == 'workflow_dispatch' }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python environment
+        uses: ./.github/actions/setup-python-env
+        with:
+          bootstrap-tools: "true"
+
+      - name: Run CPU smoke tests
+        run: |
+          make bootstrap-nss cpu
+          make test-smoke
+
   # ---------------------------------------------------------------------------
   # Single required status check for branch protection.
   # Aggregates results from all upstream jobs so that skipped jobs (due to
@@ -173,7 +194,7 @@ jobs:
   ci-status:
     name: CI Status
     if: always() && !cancelled()
-    needs: [changes, format, lint, typecheck, unit-test]
+    needs: [changes, format, lint, typecheck, unit-test, smoke-test]
     runs-on: ubuntu-latest
     steps:
       - name: Check job results
@@ -182,7 +203,8 @@ jobs:
           echo "format:    ${{ needs.format.result }}"
           echo "lint:      ${{ needs.lint.result }}"
           echo "typecheck: ${{ needs.typecheck.result }}"
-          echo "unit-test: ${{ needs.unit-test.result }}"
+          echo "unit-test:  ${{ needs.unit-test.result }}"
+          echo "smoke-test: ${{ needs.smoke-test.result }}"
 
           if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
             echo "::error::One or more CI jobs failed"
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
@@ -75,11 +75,16 @@ jobs:
           python-version: ${{ matrix.python-version }}
           bootstrap-tools: "true"
 
+      - name: Bootstrap CUDA environment
+        run: make bootstrap-nss cu128
+
+      - name: Run GPU smoke tests
+        timeout-minutes: 20
+        run: make test-gpu-integration
+
       - name: Run GPU E2E tests
         timeout-minutes: 45
-        run: |
-          make bootstrap-nss cu128
-          make test-e2e
+        run: make test-e2e
 
   # ---------------------------------------------------------------------------
   # Single required status check for branch protection.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -268,6 +268,9 @@ make test-slow
 # Run SDK-related tests (config, sdk, cli, api)
 make test-sdk-related
 
+# Run CPU smoke tests (~10 seconds, no GPU required)
+make test-smoke
+
 # Run GPU integration tests (requires CUDA)
 make test-gpu-integration
 
@@ -281,6 +284,8 @@ make test-ci-container
 uv run pytest tests/cli/test_run.py
 ```
 
+Smoke tests exercise training and generation hot paths with a tiny model. See [`tests/smoke/README.md`](tests/smoke/README.md) for details on shared fixtures, gotchas, and when to add new smoke tests.
+
 ### Test Requirements
 
 Before submitting a PR:
diff --git a/Makefile b/Makefile
@@ -159,9 +159,9 @@ test-sdk-related: ## Run SDK-related tests (config, sdk, cli, api)
 		$(NSS_ROOT_PATH)/tests/api
 
 .PHONY: test-ci
-test-ci: ## Run CI unit tests excluding slow and GPU tests
+test-ci: ## Run CI unit tests excluding e2e, slow, GPU, and smoke tests
 	pushd $(NSS_ROOT_PATH) && \
-	$(PYTEST_CMD) $(PYTEST_CI_OPTS) $(NSS_ROOT_PATH)/tests -m "not e2e and not gpu_integration and not slow"
+	$(PYTEST_CMD) $(PYTEST_CI_OPTS) $(NSS_ROOT_PATH)/tests -m "not e2e and not gpu_integration and not slow and not smoke"
 
 .PHONY: test-ci-slow
 test-ci-slow: ## Run slow tests in CI with coverage
diff --git a/pytest.ini b/pytest.ini
@@ -22,7 +22,7 @@ markers =
     unit: Unit tests - test single classes/functions with no infrastructure dependencies
     unit_test: Legacy marker for unit tests (deprecated, use 'unit' instead)
     noautouse: Marker to skip autouse fixtures for specific tests
-    smoke: Smoke tests - slow unit tests exercising training/generation hot paths with tiny model
+    smoke: Smoke tests - quick tests exercising training/generation hot paths with tiny models
 
 # Note: Unit tests (testing single classes/functions with no infrastructure dependencies)
 # do not need markers and are the default test type.
diff --git a/src/nemo_safe_synthesizer/llm/metadata.py b/src/nemo_safe_synthesizer/llm/metadata.py
@@ -262,7 +262,7 @@ def save_metadata(self) -> None:
 
     @classmethod
     def from_str_or_path(cls: type["ModelMetadata"], model_name_or_path: Path | str, **kwargs) -> ModelMetadata:
-        classes = TinyLlama, Qwen, Llama32, SmolLM2, SmolLM3, Mistral, Nemotron, Granite
+        classes = TinyLlama, Qwen, Llama32, SmolLM3, Mistral, Nemotron, Granite
         for class_ in classes:
             if str(class_.__name__).lower() in str(model_name_or_path).lower():
                 return class_(model_name_or_path=str(model_name_or_path), **kwargs)
@@ -473,37 +473,6 @@ def __init__(
         )
 
 
-class SmolLM2(ModelMetadata):
-    """SmolLM2 models (e.g., HuggingFaceTB/SmolLM2-135M).
-    Potentially used for testing."""
-
-    def __init__(
-        self, model_name_or_path: str, tokenizer=None, rope_scaling_factor: float | None = None, **kwargs
-    ) -> None:
-        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) if tokenizer is None else tokenizer
-        config = AutoConfig.from_pretrained(model_name_or_path)
-        if rope_scaling_factor:
-            logger.warning(
-                f"Rope scaling factor {rope_scaling_factor} is not supported for Mistral due to longer default context lengths. Ignoring."
-            )
-
-        super().__init__(
-            autoconfig=config,
-            instruction=DEFAULT_INSTRUCTION,
-            prompt_config=LLMPromptConfig.from_tokenizer(
-                template="user\n {instruction} {schema} \n assistant\n{prefill}",
-                add_bos_token_to_prompt=False,
-                add_eos_token_to_prompt=False,
-                tokenizer=tokenizer,
-                name=model_name_or_path,
-            ),
-            model_name_or_path=model_name_or_path,
-            rope_scaling=None,
-            rope_parameters_location="autoconfig",
-            **kwargs,
-        )
-
-
 class SmolLM3(ModelMetadata):
     def __init__(
         self, model_name_or_path: str, tokenizer=None, rope_scaling_factor: float | None = None, **kwargs
diff --git a/tests/llm/test_metadata.py b/tests/llm/test_metadata.py
@@ -34,7 +34,6 @@
     Nemotron,
     Qwen,
     RopeScaling,
-    SmolLM2,
     SmolLM3,
     TinyLlama,
     resolve_rope_scaling_factor,
@@ -87,7 +86,6 @@ class RopeScalingScenario:
     ModelDetectionScenario("tinyllama", "TinyLlama/TinyLlama-1.1B-Chat-v1.0", TinyLlama),
     ModelDetectionScenario("qwen", "Qwen/Qwen2-0.5B", Qwen),
     ModelDetectionScenario("llama32", "meta-llama/Llama32-1B", Llama32),
-    ModelDetectionScenario("smollm2", "HuggingFaceTB/SmolLM2-135M", SmolLM2),
     ModelDetectionScenario("smollm3", "HuggingFaceTB/SmolLM3-3B", SmolLM3),
     ModelDetectionScenario("mistral", "mistralai/Mistral-7B-v0.1", Mistral),
     ModelDetectionScenario("nemotron", "nvidia/Nemotron-4-340B", Nemotron),
@@ -125,17 +123,6 @@ class RopeScalingScenario:
         expected_bos_token="<|im_start|>",
         expected_bos_token_id=151644,
     ),
-    ModelInitScenario(
-        id="smollm2",
-        model_class=SmolLM2,
-        model_path="HuggingFaceTB/SmolLM2-135M",
-        expected_template="user\n {instruction} {schema} \n assistant\n{prefill}",
-        expected_add_bos=False,
-        expected_add_eos=False,
-        expected_bos_token="<|im_start|>",
-        expected_bos_token_id=151644,
-        custom_max_position_embeddings=8192,
-    ),
     ModelInitScenario(
         id="smollm3",
         model_class=SmolLM3,
diff --git a/tests/smoke/README.md b/tests/smoke/README.md
@@ -1,158 +1,44 @@
-# Fast Smoke Tests for Training and Generation Hot Paths
+# Smoke Tests
 
-Smoke tests exercising training and generation hot paths with a tiny model.
-Run CPU tests without GPU (`make test-smoke`), GPU tests via `make test-gpu-integration`.
+Quick tests that verify training and generation code paths don't crash.
+They use tiny or small models and run in seconds (CPU) or a few minutes (GPU).
 
-## Shared Infrastructure (`conftest.py`)
-
-Key fixtures and helpers available to all smoke tests:
-
-- **`base_smoke_config`** -- `SafeSynthesizerParameters` with local tiny model defaults
-- **`train_with_sdk(config, data_df, save_path)`** -- runs `process_data().train()`, returns the `SafeSynthesizer` instance
-- **`assert_adapter_saved(workdir)`** -- asserts `adapter_config.json` + `*.safetensors` exist
-- **`_patch_attn_eager`** -- monkeypatches `attn_implementation` to `"eager"` for tiny model compatibility
-- **`tiny_model`** / **`stub_tokenizer`** / **`tiny_training_dataset`** -- CPU test primitives
-- **`local_tinyllama_dir`** -- local model directory for GPU tests (no internet needed)
-- **`iris_df`** / **`timeseries_df`** -- small stub datasets
-
-## Design Origin
-
-This test suite was organized into **self-contained work units (WUs)** that were delegated independently. WU1 and WU2 (infrastructure and fixtures) were done first. After that, WU3-WU11 were done in parallel, then consolidated in WU13.
-
-## Dependency Graph and Parallel Execution Strategy
-
-There are only two sequential dependencies: WU1 -> WU2 (foundation). After that, **all remaining WUs are fully independent** -- no test file reads output from another test file. Each GPU test does its own training internally.
-
-```mermaid
-flowchart TD
-    subgraph phase1 ["Phase 1 -- Foundation (sequential, do first)"]
-        WU1["WU1: Infrastructure"] --> WU2["WU2: Shared Fixtures"]
-    end
-
-    subgraph phase2 ["Phase 2 -- All parallelizable (no inter-dependencies)"]
-        direction TB
-        batchA["Batch A: WU0 README"]
-        batchB["Batch B: WU3 CPU Training + WU4 CPU Generation"]
-        batchC["Batch C: WU5 GPU Training + WU10 GPU Adapter Persistence"]
-        batchD["Batch D: WU6 GPU Generation + WU8 GPU Structured Gen"]
-        batchE["Batch E: WU7 GPU Timeseries"]
-        batchF["Batch F: WU9 GPU Resume"]
-        batchG["Batch G: WU11 GPU Full Pipeline + WU12 Unsloth"]
-    end
-
-    subgraph phase3 ["Phase 3 -- Consolidation (fresh agent, after all Phase 2)"]
-        WU13["WU13: DRY pass -- deduplicate, extract helpers, consolidate"]
-    end
-
-    WU2 --> batchA
-    WU2 --> batchB
-    WU2 --> batchC
-    WU2 --> batchD
-    WU2 --> batchE
-    WU2 --> batchF
-    WU2 --> batchG
-
-    batchA --> WU13
-    batchB --> WU13
-    batchC --> WU13
-    batchD --> WU13
-    batchE --> WU13
-    batchF --> WU13
-    batchG --> WU13
+```bash
+make test-smoke             # CPU only, no GPU needed
+make test-gpu-integration   # GPU tests (requires CUDA)
 ```
 
+## When should I add a smoke test?
 
+If you're adding a new training backend, generation backend, or model family,
+add a smoke test for it. Same if you're changing how the SDK orchestrates
+train/generate -- those paths are easy to break silently.
 
-### Recommended Delegation Batches
-
-WU3-WU11 are grouped by **skill similarity** so each assignee has minimal context-switching:
-
-
-| Batch       | WUs             | Why grouped                                                       | Skills needed                                     | Size                      |
-| ----------- | --------------- | ----------------------------------------------------------------- | ------------------------------------------------- | ------------------------- |
-| **Phase 1** | WU0 + WU1 + WU2 | Sequential foundation; one person does all setup                  | pytest fixtures, basic infra                      | ~30 min                   |
-| **B**       | WU3 + WU4       | Both CPU-only, similar Trainer/generate patterns                  | HF Trainer, peft, Opacus, NSS assembler/processor | Medium (2 files, 7 tests) |
-| **C**       | WU5 + WU10      | Both train via SDK then inspect adapter output                    | SafeSynthesizer SDK, PEFT adapter loading         | Medium (2 files, 5 tests) |
-| **D**       | WU6 + WU8       | Both exercise vLLM generation paths                               | VllmBackend, vLLM, structured outputs             | Medium (2 files, 3 tests) |
-| **E**       | WU7             | Specialized timeseries knowledge                                  | TimeseriesBackend, timeseries config              | Small (1 file, 1 test)    |
-| **F**       | WU9             | Specialized resume/Workdir knowledge                              | SafeSynthesizer resume, load_from_save_path       | Small (1 file, 1 test)    |
-| **G**       | WU11 + WU12     | Both need internet + HF Hub; WU12 needs process isolation from DP | SmolLM2, Unsloth, HF Hub, Makefile update         | Medium (2 files, 3 tests) |
-
-
-### Priority order (if fewer hands available)
+Smoke tests don't check output quality. They just make sure the code runs
+end-to-end without throwing. Use the smallest model that exercises the path
+(the local `tiny_llama` stub for most things, SmolLM2-135M when you need
+a real tokenizer/model).
 
-### Phase 3: Consolidation (sequential, after all Phase 2 batches complete)
+## Things that will bite you
 
+- **LoRA rank must be 8** (not 4). vLLM silently rejects rank 4. Use `lora_r=8`.
+- **Iris only has 151 rows**, but holdout needs >=200. Set `holdout=0, max_holdout=0` to skip it.
+- **Attention implementation**: HuggingFaceBackend defaults to `flashinfer`, which HF doesn't recognize. The `_patch_attn_eager` fixture overrides it to `"sdpa"` (despite the "eager" in the fixture's name).
+- **Stub tokenizer vocab is 32000**. If you change the tiny model config, keep `vocab_size=32000` or you'll get shape mismatches.
+- **Always set `use_unsloth=False`** unless you're specifically testing Unsloth. The `auto` default can pull it in and it monkey-patches transformers globally.
+- **CPU tests need `optim="adamw_torch"`**. The production default (`paged_adamw_32bit`) requires bitsandbytes CUDA kernels.
+- **Unsloth tests run in a separate process**. Unsloth patches transformers at import time, which breaks Opacus/DP if they share a process. The Makefile handles this automatically.
 
-| Batch | WU   | Purpose                                                                         | Owner                                              |
-| ----- | ---- | ------------------------------------------------------------------------------- | -------------------------------------------------- |
-| **H** | WU13 | Holistic DRY pass: deduplicate configs, extract helpers, consolidate decorators | **Fresh agent** (must NOT have worked on WU3-WU12) |
+## What's in `conftest.py`?
 
+The shared fixtures cover both CPU and GPU smoke tests. The most important ones:
 
-### Priority order (if fewer hands available)
+- `base_smoke_config` -- default `SafeSynthesizerParameters` pointing at the local tiny model
+- `train_with_sdk(config, data_df, save_path)` -- convenience wrapper around the SDK train flow
+- `assert_adapter_saved(workdir)` -- checks that adapter files landed on disk
+- `_patch_attn_eager` -- the attention implementation workaround mentioned above
+- `tiny_model`, `stub_tokenizer`, `tiny_training_dataset` -- CPU test building blocks
+- `local_tinyllama_dir` -- saves the tiny model to a temp dir so GPU tests don't need internet
+- `iris_df`, `timeseries_df` -- small DataFrames for training input
 
-If you cannot parallelize all batches, do them in this order (highest value first):
-
-1. **Phase 1** (A) -- must go first
-2. **C** (GPU SDK training + adapter) -- highest signal for catching regressions
-3. **D** (GPU generation + structured) -- tests the production generation path
-4. **B** (CPU training + generation) -- catches dep breakage without GPU
-5. **E** (timeseries) -- specialized but important path
-6. **F** (resume) -- important production flow
-7. **G** (SmolLM2 + Unsloth) -- lowest priority, needs internet; WU12 also needs Makefile update for process isolation
-8. **H** (consolidation) -- must be last; requires fresh eyes
-
----
-
-## Critical Gotchas (every WU must know these)
-
-These were discovered by automated council review and affect ALL work units:
-
-1. **Copyright headers**: Every new `.py` file MUST start with:
-  ```python
-   # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-   # SPDX-License-Identifier: Apache-2.0
-  ```
-   Enforced by `tools/lint/copyright_fixer.py --check .` in CI.
-2. `**lora_r=8` not 4**: vLLM only allows LoRA ranks in {1, 8, 16, 32, 64, ...}. Rank 4 is silently rejected. Use 8 everywhere in smoke tests.
-3. `**holdout=0, max_holdout=0**`: The iris dataset has 151 rows, but `Holdout.train_test_split()` in `src/nemo_safe_synthesizer/holdout/holdout.py` requires >=200 rows. Setting holdout=0 bypasses this.
-4. `**attn_implementation="eager"**`: The `HuggingFaceBackend` defaults to `flash_attention_2` which can fail with head_dim=32 (our tiny model: hidden_size=64 / 2 heads). Override to `"eager"` in smoke tests.
-5. `**vocab_size=32000**`: The stub tokenizer at `tests/stub_tokenizer/` has 32000 tokens. The tiny model config must match this exactly.
-6. `**use_unsloth=False**`: Always set explicitly. The `auto` default may resolve to `True` and pull in Unsloth, which invasively patches transformers.
-7. `**optim="adamw_torch"**` for CPU tests: The production default `paged_adamw_32bit` requires bitsandbytes CUDA kernels.
-
----
-
-## Summary: File Inventory
-
-
-| File                                              | WU   | Tests              | Marker                      |
-| ------------------------------------------------- | ---- | ------------------ | --------------------------- |
-| `tests/smoke/README.md`                           | WU0  | -- (documentation) | --                          |
-| `tests/smoke/__init__.py`                         | WU1  | --                 | --                          |
-| `tests/smoke/conftest.py`                         | WU2  | -- (fixtures only) | --                          |
-| `tests/smoke/test_training_cpu.py`                | WU3  | 4                  | `smoke` (auto)              |
-| `tests/smoke/test_generation_cpu.py`              | WU4  | 3                  | `smoke` (auto)              |
-| `tests/smoke/test_nss_training_gpu.py`            | WU5  | 2                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_nss_generation_gpu.py`          | WU6  | 2                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_nss_timeseries_gpu.py`          | WU7  | 1                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_nss_structured_gen_gpu.py`      | WU8  | 1                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_nss_resume_gpu.py`              | WU9  | 1                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_nss_adapter_persistence_gpu.py` | WU10 | 3                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_full_pipeline_gpu.py`           | WU11 | 2                  | `smoke` + `gpu_integration` |
-| `tests/smoke/test_nss_unsloth_gpu.py`             | WU12 | 1                  | `smoke` + `gpu_integration` |
-
-
-**Modified files**: `tests/conftest.py`, `pytest.ini`, `Makefile`
-
-**Total**: 21 tests across 10 test files, plus 2 infra files (conftest, init) and 1 README.
-
-## Running
-
-```bash
-# CPU smoke tests only (~10 seconds, no GPU required)
-make test-smoke
-
-# GPU smoke + e2e tests (requires CUDA)
-make test-gpu-integration
-```
+See [CONTRIBUTING.md](../../CONTRIBUTING.md#testing) for the full list of test commands.
diff --git a/tests/smoke/conftest.py b/tests/smoke/conftest.py
diff --git a/tests/smoke/test_full_pipeline_gpu.py b/tests/smoke/test_full_pipeline_gpu.py
diff --git a/tests/smoke/test_generation_cpu.py b/tests/smoke/test_generation_cpu.py
diff --git a/tests/smoke/test_nss_generation_gpu.py b/tests/smoke/test_nss_generation_gpu.py