|
| 1 | +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +from pathlib import Path |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +import pytest |
| 8 | +from datasets import Dataset |
| 9 | +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM |
| 10 | + |
| 11 | + |
@pytest.fixture(scope="session")
def fixture_stub_tokenizer_path() -> str:
    """Return the stub tokenizer directory path as a string.

    Session-scoped override of the function-scoped fixture in tests/conftest.py.
    The directory is ``stub_tokenizer`` located two levels above this file.
    """
    this_dir = Path(__file__).parent
    tokenizer_dir = this_dir.parent / "stub_tokenizer"
    return str(tokenizer_dir)
| 16 | + |
| 17 | + |
@pytest.fixture(scope="session")
def tiny_llama_config(fixture_stub_tokenizer_path):
    """Build a LlamaConfig with minimal dimensions for fast smoke testing.

    The vocabulary size is taken from ``len(tokenizer)`` rather than
    ``tokenizer.vocab_size``. ``vocab_size`` reports only the base vocabulary
    and leaves out added tokens, so an added token id could index past the
    end of the embedding table. ``len(tokenizer)`` counts base and added
    tokens and is identical to ``vocab_size`` when nothing has been added.

    Args:
        fixture_stub_tokenizer_path: Path of the stub tokenizer directory.

    Returns:
        A LlamaConfig with 2 hidden layers, 2 attention heads, 2 key/value
        heads, hidden size 64, intermediate size 128 and a maximum of 128
        position embeddings.
    """
    tokenizer = AutoTokenizer.from_pretrained(fixture_stub_tokenizer_path)
    return LlamaConfig(
        # Must cover every id the stub tokenizer can emit (base + added tokens).
        vocab_size=len(tokenizer),
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=2,
        num_key_value_heads=2,
        max_position_embeddings=128,
    )
| 31 | + |
| 32 | + |
@pytest.fixture
def tiny_model(tiny_llama_config):
    """Return a fresh, randomly initialized LlamaForCausalLM for each test.

    The model is constructed directly from ``tiny_llama_config`` instead of
    being loaded with ``from_pretrained``, so no weights are fetched.
    """
    model = LlamaForCausalLM(tiny_llama_config)
    return model
| 37 | + |
| 38 | + |
@pytest.fixture(scope="session")
def stub_tokenizer(fixture_stub_tokenizer_path):
    """Load the Llama stub tokenizer (tests/stub_tokenizer/) once per session."""
    tokenizer = AutoTokenizer.from_pretrained(fixture_stub_tokenizer_path)
    return tokenizer
| 43 | + |
| 44 | + |
@pytest.fixture(scope="session")
def tiny_training_dataset(stub_tokenizer):
    """Return 8 tokenized JSON-like records as a ``datasets.Dataset``.

    Each record is padded/truncated to 64 tokens. The dataset has three
    columns: ``input_ids``, ``attention_mask`` and ``labels``, where
    ``labels`` is a copy of ``input_ids`` (causal LM objective).
    """
    # Letters a..h paired with digits 1..8 give the eight sample records.
    records = [
        '{"col1":"%s","col2":"%d"}' % (letter, number)
        for number, letter in enumerate("abcdefgh", start=1)
    ]
    encoded = stub_tokenizer(
        records,
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors="np",
    )
    token_ids = encoded["input_ids"].tolist()
    mask = encoded["attention_mask"].tolist()
    # NOTE(review): labels mirror input_ids verbatim, so padded positions are
    # part of the labels too (they are not replaced with an ignore index).
    columns = {
        "input_ids": token_ids,
        "attention_mask": mask,
        "labels": token_ids,
    }
    return Dataset.from_dict(columns)
| 66 | + |
| 67 | + |
@pytest.fixture(scope="session")
def tiny_training_dataset_with_position_ids(tiny_training_dataset):
    """Return the tiny training dataset extended with a ``position_ids`` column.

    Every row receives ``[0, 1, ..., seq_len - 1]`` where ``seq_len`` is the
    length of the first row's ``input_ids``. Per the original note, this column
    is required by DataCollatorForPrivateTokenClassification.
    """
    first_row = tiny_training_dataset[0]
    seq_len = len(first_row["input_ids"])
    row_count = len(tiny_training_dataset)
    positions = [list(range(seq_len)) for _ in range(row_count)]
    return tiny_training_dataset.add_column("position_ids", positions)
| 74 | + |
| 75 | + |
@pytest.fixture(scope="session")
def local_tinyllama_dir(tmp_path_factory, tiny_llama_config, stub_tokenizer):
    """Write a tiny random model and the stub tokenizer to a temp directory.

    The directory name contains 'tinyllama'; per the original note this is
    needed for NSS compatibility. Returns the directory path.
    """
    target_dir = tmp_path_factory.mktemp("smoke-tinyllama-model")
    # Model first, then tokenizer, both into the same directory.
    LlamaForCausalLM(tiny_llama_config).save_pretrained(target_dir)
    stub_tokenizer.save_pretrained(target_dir)
    return target_dir
| 84 | + |
| 85 | + |
@pytest.fixture(scope="session")
def iris_df():
    """Read ``stub_datasets/iris.csv`` (two levels above this file) into a DataFrame."""
    this_dir = Path(__file__).parent
    csv_path = this_dir.parent / "stub_datasets" / "iris.csv"
    return pd.read_csv(csv_path)
| 90 | + |
| 91 | + |
@pytest.fixture(scope="session")
def timeseries_df():
    """Return a minimal timeseries stub: groups "A" and "B", 5 rows each.

    Columns are ``group_id``, ``elapsed_seconds`` (0..240 in steps of 60 per
    group) and ``value`` (10..50 for "A", 100..140 for "B", in steps of 10).
    """
    rows_per_group = 5
    group_ids = ["A"] * rows_per_group + ["B"] * rows_per_group
    elapsed = [60 * step for step in range(rows_per_group)] * 2
    values = list(range(10, 60, 10)) + list(range(100, 150, 10))
    columns = {
        "group_id": group_ids,
        "elapsed_seconds": elapsed,
        "value": values,
    }
    return pd.DataFrame(columns)
| 102 | + |
| 103 | + |
@pytest.fixture(scope="session")
def smoke_save_path(tmp_path_factory):
    """Create one session-wide temp directory shared by the Tier B (SmolLM2) train -> generate flow."""
    shared_dir = tmp_path_factory.mktemp("smoke-tier-b")
    return shared_dir
| 108 | + |
| 109 | + |
@pytest.fixture
def base_smoke_config(local_tinyllama_dir):
    """Return the baseline SafeSynthesizerParameters for GPU smoke tests on the local tiny model.

    Synthesis is enabled, PII replacement and unsloth are disabled, and the
    holdout settings are zero. Individual tests override specific fields via
    SafeSynthesizerParameters.from_params(**overrides).
    """
    # Imported lazily, as in the original, so the module loads without the package.
    from nemo_safe_synthesizer.config.parameters import SafeSynthesizerParameters

    baseline = {
        "enable_synthesis": True,
        "enable_replace_pii": False,
        "pretrained_model": str(local_tinyllama_dir),
        "use_unsloth": False,
        "num_input_records_to_sample": 10,
        "num_records": 5,
        "lora_r": 8,
        "holdout": 0,
        "max_holdout": 0,
    }
    return SafeSynthesizerParameters.from_params(**baseline)
| 129 | + |
| 130 | + |
def assert_adapter_saved(workdir):
    """Assert that training left adapter artifacts on disk.

    Reusable helper for any test that trains via the SDK. Checks the directory
    at ``workdir.train.adapter.path`` for an ``adapter_config.json`` file and
    at least one ``*.safetensors`` file.

    Raises:
        AssertionError: If either artifact is missing.
    """
    adapter_root = workdir.train.adapter.path
    config_file = adapter_root / "adapter_config.json"
    assert config_file.exists(), "adapter_config.json missing"
    # A single match is enough; stop at the first one the glob yields.
    first_weights = next(adapter_root.glob("*.safetensors"), None)
    assert first_weights is not None, "No safetensors files found"
| 139 | + |
| 140 | + |
def train_with_sdk(config, data_df, save_path):
    """Train through the SDK and return the SafeSynthesizer instance.

    Builds a SafeSynthesizer from ``config`` and ``save_path``, attaches
    ``data_df`` as its data source, then runs ``process_data()`` followed by
    ``train()``.
    """
    # Imported lazily, as in the original, so the module loads without the package.
    from nemo_safe_synthesizer.sdk.library_builder import SafeSynthesizer

    synthesizer = SafeSynthesizer(config=config, save_path=save_path)
    with_source = synthesizer.with_data_source(data_df)
    processed = with_source.process_data()
    processed.train()
    return synthesizer
| 148 | + |
| 149 | + |
@pytest.fixture
def _patch_attn_eager(monkeypatch):
    """Default ``attn_implementation`` to 'eager' for tiny model compatibility.

    Wraps ``HuggingFaceBackend._build_base_framework_params`` so that
    ``model_kwargs`` carries ``attn_implementation="eager"`` whenever the
    caller has not already set that key; an existing value is left untouched.
    The patch is undone by ``monkeypatch`` at test teardown.

    Per the original note, HuggingFaceBackend defaults to 'flashinfer', which
    can fail with head_dim=32 (our tiny model: hidden_size=64 / 2 heads).
    """
    from nemo_safe_synthesizer.training.huggingface_backend import HuggingFaceBackend

    unpatched = HuggingFaceBackend._build_base_framework_params

    def patched(self, model_kwargs):
        # Only fills in the key when absent; mutates the caller's dict in place.
        if "attn_implementation" not in model_kwargs:
            model_kwargs["attn_implementation"] = "eager"
        return unpatched(self, model_kwargs)

    monkeypatch.setattr(HuggingFaceBackend, "_build_base_framework_params", patched)
0 commit comments