chore: try to use kernels lib to get flash attention kernel (#30)

binaryaaron · web-flow · commit e871be4abe92 · 2026-02-23T19:22:16.000-07:00
In an attempt to ease some deps and open up more potential paths for
easier benchmarking, this lets us specify flash attention
implementations/backends on the main config objects for both training
and generation.

There are some typing-related changes in this too that affect the artifact
structure's internals. I'm going to do a follow-up to fix _a lot_ of
typing issues throughout the repo soon; I have a draft in progress.

---------

Signed-off-by: Aaron Gonzales <aagonzales@nvidia.com>
Signed-off-by: aagonzales <aagonzales@nvidia.com>
diff --git a/Makefile b/Makefile
@@ -39,7 +39,7 @@ $(info local system architecture: $(PLATFORM)/$(ARCH))
 .PHONY: help
 help:
 	@echo "Makefile commands:"
-	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
 
 
 ### BOOTSTRAP AND SETUP ###
diff --git a/README.md b/README.md
@@ -191,6 +191,47 @@ Commands:
   validate  Validate a Safe Synthesizer configuration.
 ```
 
+## Attention Configuration
+
+Safe Synthesizer exposes attention implementation settings for both training and generation.
+
+### Training (`attn_implementation`)
+
+Controls the HuggingFace attention backend used during model loading for training. Set via config YAML, CLI, or SDK:
+
+```yaml
+# config.yaml
+training:
+  attn_implementation: "kernels-community/vllm-flash-attn3"
+```
+
+```bash
+# CLI override
+safe-synthesizer run --training__attn_implementation sdpa --url my_data.csv
+```
+
+| Value | Description | Requires |
+|-------|-------------|----------|
+| `kernels-community/vllm-flash-attn3` | Flash Attention 3 via HuggingFace Kernels Hub (default) | `kernels` pip package |
+| `kernels-community/flash-attn2` | Flash Attention 2 via HuggingFace Kernels Hub | `kernels` pip package |
+| `flash_attention_2` | Flash Attention 2 (traditional) | `flash-attn` pip package |
+| `sdpa` | PyTorch scaled dot product attention | None (built-in) |
+| `eager` | Standard PyTorch attention | None (built-in) |
+
+If the default `kernels-community/vllm-flash-attn3` is configured but the `kernels` package is not installed, the backend automatically falls back to `sdpa`.
+
+### Generation (`attention_backend`)
+
+Controls the vLLM attention backend used during synthetic data generation. Defaults to `"auto"`, which lets vLLM auto-select the best available backend.
+
+```yaml
+# config.yaml
+generation:
+  attention_backend: "FLASH_ATTN"
+```
+
+Common values: `FLASHINFER`, `FLASH_ATTN`, `TORCH_SDPA`, `TRITON_ATTN`, `FLEX_ATTENTION`.
+
 ## Artifacts and Workdirs
 
 Safe Synthesizer uses a structured directory format to manage artifacts (trained models, synthetic data, logs).
diff --git a/src/nemo_safe_synthesizer/cli/artifact_structure.py b/src/nemo_safe_synthesizer/cli/artifact_structure.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@@ -196,11 +197,11 @@ def __get__(self, obj: object | None, objtype: type | None = None) -> DirNode |
                 raise TypeError(f"DirNode can only be used with BoundDir or Workdir, got {type(obj)}")
 
 
-class BoundDir:
+class BoundDir(os.PathLike[str]):
     """Runtime class representing a bound directory path.
 
-    Provides access to child FileNode and DirNode descriptors as attributes,
-    and implements __fspath__ for use with os.path functions.
+    Provides access to child FileNode and DirNode descriptors as attributes.
+    Implements os.PathLike[str] so instances can be used wherever paths are expected.
     """
 
     def __init__(self, path: Path, children: dict[str, FileNode | DirNode]):
@@ -240,12 +241,6 @@ def __eq__(self, other: object) -> bool:
     def __hash__(self) -> int:
         return hash(self._path)
 
-    def __getattribute__(self, name: str) -> Path | BoundDir:
-        # Allow access to special methods, private attrs, and the path property
-        if name.startswith("_") or name == "path":
-            return super().__getattribute__(name)
-        return self.__getattr__(name)
-
     def __getattr__(self, name: str) -> Path | BoundDir:
         if name.startswith("_"):
             raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")
diff --git a/src/nemo_safe_synthesizer/config/generate.py b/src/nemo_safe_synthesizer/config/generate.py
@@ -75,6 +75,8 @@ class GenerateParameters(Parameters, BaseModel):
         patience: Number of invalid records fraction before stopping.
         invalid_fraction_threshold: "The fraction of invalid records that will stop generation after the `patience` limit is reached."
         use_structured_generation: Whether to use structured generation for better format control.
+        attention_backend: The attention backend for the vLLM engine. If None, vLLM will
+            auto-select the best available backend.
 
     """
 
@@ -179,3 +181,15 @@ class GenerateParameters(Parameters, BaseModel):
         description="Validation parameters controlling validation logic and automatic fixes when parsing LLM output and converting to tabular data.",
         default_factory=ValidationParameters,
     )
+
+    attention_backend: Annotated[
+        str | None,
+        Field(
+            title="attention_backend",
+            description=(
+                "The attention backend for the vLLM engine. Common values: 'FLASHINFER', "
+                "'FLASH_ATTN', 'TRITON_ATTN', 'FLEX_ATTENTION'. "
+                "If None or 'auto', vLLM will auto-select the best available backend."
+            ),
+        ),
+    ] = "auto"
diff --git a/src/nemo_safe_synthesizer/config/training.py b/src/nemo_safe_synthesizer/config/training.py
@@ -79,6 +79,10 @@ class TrainingHyperparams(Parameters):
         peft_implementation: The PEFT (Parameter-Efficient Fine-Tuning) implementation to use.
             Options include 'lora' for Low-Rank Adaptation, QLoRA for Quantized LoRA. Each method has its own trade-offs in terms of performance
             and resource requirements.
+        attn_implementation: The attention implementation to use for model loading.
+            Default uses Flash Attention 3 via the HuggingFace Kernels Hub. Falls back to 'sdpa'
+            if the kernels package is not installed. Other common values include 'flash_attention_2',
+            'sdpa', and 'eager'.
     """
 
     num_input_records_to_sample: Annotated[
@@ -285,3 +289,19 @@ class TrainingHyperparams(Parameters):
             description="The fraction of the total VRAM to use for training. Default is 0.9. Modify this to allow longer sequences to be used.",
         ),
     ] = 0.80
+
+    attn_implementation: Annotated[
+        str,
+        Field(
+            title="attn_implementation",
+            description=(
+                "The attention implementation to use for model loading. "
+                "Default uses Flash Attention 3 via the HuggingFace Kernels Hub "
+                "(requires the 'kernels' pip package; falls back to 'sdpa' if unavailable). "
+                "Other common values: 'flash_attention_2' (requires flash-attn pip package), "
+                "'sdpa' (PyTorch scaled dot product attention), 'eager' (standard PyTorch). "
+                "Custom HuggingFace Kernels Hub paths (e.g. 'kernels-community/flash-attn2') "
+                "are also supported."
+            ),
+        ),
+    ] = "kernels-community/vllm-flash-attn3"
diff --git a/src/nemo_safe_synthesizer/defaults.py b/src/nemo_safe_synthesizer/defaults.py
@@ -92,3 +92,5 @@
 EPS = 1e-15
 NUM_SPECIAL_TOKENS = 2
 DEFAULT_CACHE_PREFIX = "safe-synthesizer-dataset-cache"
+DEFAULT_ATTN_IMPLEMENTATION = "kernels-community/vllm-flash-attn3"
+BACKUP_ATTN_IMPLEMENTATION = "sdpa"
diff --git a/src/nemo_safe_synthesizer/generation/vllm_backend.py b/src/nemo_safe_synthesizer/generation/vllm_backend.py
@@ -81,8 +81,14 @@ def __del__(self) -> None:
 
     def initialize(self, **kwargs) -> None:
         """Initialize and load the model into memory."""
-        max_vram = get_max_vram(as_fraction=True)
-        max_vram = max_vram.get(0)
+        # vLLM 0.11.x uses an environment variable for attention backend selection.
+        # When vLLM is upgraded to 0.12+, migrate to the attention_backend constructor arg.
+        if self.config.generation.attention_backend not in [None, "auto"]:
+            os.environ["VLLM_ATTENTION_BACKEND"] = self.config.generation.attention_backend
+
+        max_vram = get_max_vram()
+        # note this only works for single GPU setups
+        max_vram = max_vram.get(0, 0.8)
 
         # vllm requires this "config" to set the backend ahead of time.
         structured_outputs_config = StructuredOutputsConfig(
@@ -91,7 +97,7 @@ def initialize(self, **kwargs) -> None:
         )
         self.llm = vLLM(
             model=self.config.training.pretrained_model,
-            gpu_memory_utilization=float(max_vram),
+            gpu_memory_utilization=max_vram,
             enable_lora=True,
             max_lora_rank=self.config.training.lora_r,
             structured_outputs_config=structured_outputs_config,
diff --git a/src/nemo_safe_synthesizer/llm/utils.py b/src/nemo_safe_synthesizer/llm/utils.py
@@ -63,35 +63,30 @@ def round_gb(value: float) -> float:
     logger.info(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
 
 
-def get_max_vram(
-    memory_fraction: float | None = None, as_string: bool = True, as_fraction: bool = False
-) -> dict[int, float | str]:
+def get_max_vram(max_vram_fraction: float | None = None) -> dict[int, float]:
     """
-    Calculate max memory allocation for each available GPU and CPU.
+    Calculate max memory allocation for each available GPU and CPU as a fraction of total GPU memory.
 
     Args:
         memory_fraction: Fraction of total GPU memory to allocate (default 0.8 for 80%)
 
     Returns:
-        Dictionary mapping device IDs to memory limits
+        Dictionary mapping device IDs to memory limits as a fraction of total GPU memory
     """
-    if memory_fraction is None:
-        memory_fraction = 0.8
+    if max_vram_fraction is None:
+        max_vram_fraction = 0.8
     max_memory = {}
 
     if torch.cuda.is_available():
         num_gpus = torch.cuda.device_count()
         for i in range(num_gpus):
             free, total = torch.cuda.mem_get_info(device=i)
             safe_free = free - (2 * 1024**3)
-            gpu_memory_utilization = min(memory_fraction, safe_free / total)
+            gpu_memory_utilization = min(max_vram_fraction, safe_free / total)
             memory_gib = gpu_memory_utilization * total / (1024**3)
-            if as_fraction:
-                max_memory[i] = gpu_memory_utilization
-            else:
-                max_memory[i] = memory_gib if not as_string else f"{memory_gib:.2f}GiB"
+            max_memory[i] = gpu_memory_utilization
             logger.info(
-                f"GPU {i}: Will allocate {memory_gib:.2f}GiB ({memory_fraction * 100}% of {total / (1024**3):.2f}GiB)"
+                f"GPU {i}: Will allocate {memory_gib:.2f}GiB ({max_vram_fraction * 100}% of {total / (1024**3):.2f}GiB)"
             )
 
     return max_memory
diff --git a/src/nemo_safe_synthesizer/training/backend.py b/src/nemo_safe_synthesizer/training/backend.py
@@ -13,7 +13,6 @@
 from datasets import Dataset
 from peft import PeftModel
 from transformers import (
-    AutoTokenizer,
     PreTrainedModel,
     PreTrainedTokenizer,
     Trainer,
@@ -50,7 +49,7 @@ class NSSTrainerResult:
 
 class TrainingBackend(metaclass=abc.ABCMeta):
     model: PreTrainedModel | PeftModel
-    tokenizer: AutoTokenizer | PreTrainedTokenizer
+    tokenizer: PreTrainedTokenizer
     quant_params: dict
     load_params: dict
     trainer_type: type[OpacusDPTrainer | Trainer | FastLanguageModel]
@@ -59,7 +58,7 @@ class TrainingBackend(metaclass=abc.ABCMeta):
     results: NSSTrainerResult
     training_examples: TrainingExamples
     df_train: pd.DataFrame
-    df_test: pd.DataFrame
+    df_test: pd.DataFrame | None
     dataset_schema: dict | None
     training_output_dir: Path
     workdir: Workdir
diff --git a/src/nemo_safe_synthesizer/training/huggingface_backend.py b/src/nemo_safe_synthesizer/training/huggingface_backend.py
diff --git a/src/nemo_safe_synthesizer/utils.py b/src/nemo_safe_synthesizer/utils.py
diff --git a/tests/training/test_huggingface_backend.py b/tests/training/test_huggingface_backend.py