fix: make vLLM backend teardown idempotent and remove keep_llm_state (#91)

binaryaaron · web-flow · commit 877e11c4a625 · 2026-03-18T21:41:12.000-06:00
Closes #88

- Merge `_clear_llm_state()` into an idempotent `teardown()` guarded by a `_torn_down` flag
- Remove `keep_llm_state` from `generate()` and the base class
- Add try/finally in SDK and CLI paths
- Add shutdown lifecycle unit tests

Made with [Cursor](https://cursor.com)

---------

Signed-off-by: aagonzales <aagonzales@nvidia.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -326,6 +326,7 @@ safe-synthesizer = "nemo_safe_synthesizer.cli.cli:cli"
 # Below is a list of excluded directories from ty typechecks.
 exclude = [
     ".uv_cache", # Cache Dir in CI
+    "./docs/**/*.ipynb",
     "./src/nemo_safe_synthesizer/artifacts/",
     "./src/nemo_safe_synthesizer/pii_replacer/",
     "./tests/",
diff --git a/src/nemo_safe_synthesizer/cli/run.py b/src/nemo_safe_synthesizer/cli/run.py
@@ -219,11 +219,18 @@ def run(
         from ..sdk.library_builder import SafeSynthesizer
 
         ss: SafeSynthesizer = SafeSynthesizer(config=config, workdir=workdir).with_data_source(df)
-        ss.run()
-        ss.save_results(output_file=settings.output_file or workdir.output_file)
-        ss.results.summary.log_summary(run_logger)
-        ss.results.summary.timing.log_timing(run_logger)
-        ss.results.summary.log_wandb()
+        # ss.run() calls train + generate + evaluate. The generate step has its own try/finally,
+        # but train or evaluate failures leave the generator loaded; this guard ensures teardown
+        # on all exit paths of the full pipeline.
+        try:
+            ss.run()
+            ss.save_results(output_file=settings.output_file or workdir.output_file)
+            ss.results.summary.log_summary(run_logger)
+            ss.results.summary.timing.log_timing(run_logger)
+            ss.results.summary.log_wandb()
+        finally:
+            if hasattr(ss, "generator") and ss.generator is not None:
+                ss.generator.teardown()
 
 
 @run.command("train")
@@ -359,9 +366,18 @@ def run_generate(
         if df is not None:
             ss = ss.with_data_source(df)
 
-        ss = ss.load_from_save_path().process_data().generate().evaluate().save_results(output_file=final_output_file)
-        ss.generator.teardown()
-        ss.results.summary.log_summary(run_logger)
-        ss.results.summary.timing.log_timing(run_logger)
-        run_logger.info(f"Generation complete. Results saved to: {final_output_file}")
-        ss.results.summary.log_wandb()
+        try:
+            ss = (
+                ss.load_from_save_path()
+                .process_data()
+                .generate()
+                .evaluate()
+                .save_results(output_file=final_output_file)
+            )
+            ss.results.summary.log_summary(run_logger)
+            ss.results.summary.timing.log_timing(run_logger)
+            run_logger.info(f"Generation complete. Results saved to: {final_output_file}")
+            ss.results.summary.log_wandb()
+        finally:
+            if hasattr(ss, "generator") and ss.generator is not None:
+                ss.generator.teardown()
diff --git a/src/nemo_safe_synthesizer/generation/backend.py b/src/nemo_safe_synthesizer/generation/backend.py
@@ -102,7 +102,6 @@ def prepare_params(self, **kwargs) -> None:
     @abc.abstractmethod
     def generate(
         self,
-        keep_llm_state: bool = True,
         data_actions_fn: utils.DataActionsFn | None = None,
     ) -> GenerateJobResults:
         """Run the batch generation loop and return aggregated results.
@@ -116,9 +115,6 @@ def generate(
         batch.
 
         Args:
-            keep_llm_state: If ``True``, keep the model in GPU memory
-                after generation for potential reuse.  If ``False``,
-                GPU resources are freed immediately on completion.
             data_actions_fn: Optional post-processing / validation
                 function applied to each batch of generated records.
                 Typically reverses training-time preprocessing and
diff --git a/src/nemo_safe_synthesizer/generation/timeseries_backend.py b/src/nemo_safe_synthesizer/generation/timeseries_backend.py
@@ -856,7 +856,6 @@ def _retain_single_valid_response(self, batch: Batch) -> list[dict]:
 
     def generate(
         self,
-        keep_llm_state: bool = True,
         data_actions_fn: utils.DataActionsFn | None = None,
     ) -> GenerateJobResults:
         """Generate time-series tabular data using Nemo Safe Synthesizer.
@@ -872,7 +871,6 @@ def generate(
             seen during training (from model_metadata.initial_prefill).
 
         Args:
-            keep_llm_state: If True, keep the model in memory after generation.
             data_actions_fn: Optional function that takes a DataFrame and returns a modified DataFrame.
 
         Returns:
@@ -924,9 +922,6 @@ def generate(
         batches.job_complete()
         batches.log_status()
 
-        if not keep_llm_state:
-            self._clear_llm_state()
-
         generation_time_sec = time.monotonic() - generation_start
         self.elapsed_time = generation_time_sec
         self.gen_results = GenerateJobResults.from_batches(
diff --git a/src/nemo_safe_synthesizer/generation/vllm_backend.py b/src/nemo_safe_synthesizer/generation/vllm_backend.py
@@ -118,7 +118,13 @@ class VllmBackend(GeneratorBackend):
             avoid leaking sensitive data).
     """
 
-    def __init__(self, config: SafeSynthesizerParameters, model_metadata: ModelMetadata, workdir: Workdir, **kwargs):
+    def __init__(
+        self,
+        config: SafeSynthesizerParameters,
+        model_metadata: ModelMetadata,
+        workdir: Workdir,
+        **kwargs,
+    ):
         self.model_metadata = model_metadata
         self.config = config
         self.remote = False
@@ -140,34 +146,43 @@ def __init__(self, config: SafeSynthesizerParameters, model_metadata: ModelMetad
         self.processor = create_processor(self.schema, self.model_metadata, self.config)
         adapter_path = self.workdir.adapter_path if self.workdir.adapter_path else self.model_metadata.adapter_path
         self.lora_req = LoRARequest("lora", 1, str(adapter_path)) if adapter_path else None
+        self._torn_down = False
 
     def teardown(self) -> None:
-        """Release GPU memory and clean up distributed resources."""
-        self._clear_llm_state()
+        """Release GPU memory and distributed resources. Idempotent -- safe to call multiple times."""
+        if self._torn_down:
+            return
+        self._torn_down = True
+
+        try:
+            cleanup_dist_env_and_memory()
+        except Exception:
+            logger.debug("cleanup_dist_env_and_memory failed during teardown", exc_info=True)
 
-    def _clear_llm_state(self) -> None:
-        """Delete LLM state to free up GPU memory."""
-        cleanup_dist_env_and_memory()
-        # destroy_model_parallel()
         self.llm = None
-        logger.debug("Cleaned up LLM")
-        cleanup_memory()
-        logger.debug("Cleaned up memory")
+        self._gen_method = None
+        self.gen_method = None
+
+        try:
+            cleanup_memory()
+        except Exception:
+            logger.debug("cleanup_memory failed during teardown", exc_info=True)
 
     def __del__(self) -> None:
-        """Clean up resources on garbage collection to prevent shutdown warnings."""
+        """Clean up resources on garbage collection."""
         try:
-            self._clear_llm_state()
+            self.teardown()
         except Exception:
-            # Suppress errors during garbage collection to avoid masking other exceptions
             pass
 
     def initialize(self, **kwargs) -> None:
         """Initialize and load the model into memory."""
-        # vLLM 0.11.x uses an environment variable for attention backend selection.
-        # When vLLM is upgraded to 0.12+, migrate to the attention_backend constructor arg.
-        if self.config.generation.attention_backend not in [None, "auto"]:
-            os.environ["VLLM_ATTENTION_BACKEND"] = self.config.generation.attention_backend
+        self._torn_down = False
+
+        # vLLM 0.12+ accepts attention_config as a constructor arg (replaces the
+        # VLLM_ATTENTION_BACKEND env var used in 0.11.x).
+        attn_backend = self.config.generation.attention_backend
+        attention_config = {"backend": attn_backend} if attn_backend not in (None, "auto") else None
 
         max_vram = get_max_vram()
         # note this only works for single GPU setups
@@ -194,6 +209,7 @@ def initialize(self, **kwargs) -> None:
             max_lora_rank=self.config.training.lora_r,
             structured_outputs_config=structured_outputs_config,
             enforce_eager=enforce_eager,
+            attention_config=attention_config,
         )
 
     def _build_structured_output_params(self) -> StructuredOutputsParams | None:
@@ -455,7 +471,6 @@ def _log_batch_timing_and_progress(
 
     def generate(
         self,
-        keep_llm_state: bool = True,
         data_actions_fn: utils.DataActionsFn | None = None,
     ) -> GenerateJobResults:
         """Generate synthetic tabular data in batches until the target count is reached.
@@ -465,9 +480,6 @@ def generate(
         a stopping condition fires.
 
         Args:
-            keep_llm_state: If ``True``, keep the model in GPU memory after
-                generation for potential reuse.  The model is still freed
-                on garbage collection.
             data_actions_fn: Optional post-processing / validation function
                 applied to each batch of generated records.
 
@@ -529,9 +541,6 @@ def generate(
         batches.job_complete()
         batches.log_status()
 
-        if not keep_llm_state:
-            self._clear_llm_state()
-
         max_num_records = (
             self.config.generation.num_records
             if self.config.data.group_training_examples_by is None and batches.status == GenerationStatus.COMPLETE
diff --git a/src/nemo_safe_synthesizer/sdk/library_builder.py b/src/nemo_safe_synthesizer/sdk/library_builder.py
@@ -383,8 +383,11 @@ def generate(self) -> SafeSynthesizer:
                 config=self._nss_config, model_metadata=self._llm_metadata, workdir=self._workdir
             )
 
-        self.generator.initialize()
-        self.generator.generate(keep_llm_state=False)
+        try:
+            self.generator.initialize()
+            self.generator.generate()
+        finally:
+            self.generator.teardown()
         self._generated = True
         return self
 
diff --git a/tests/generation/test_vllm_shutdown.py b/tests/generation/test_vllm_shutdown.py
@@ -0,0 +1,130 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for the VllmBackend teardown lifecycle."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from nemo_safe_synthesizer.generation import vllm_backend as vllm_backend_mod
+from nemo_safe_synthesizer.generation.vllm_backend import VllmBackend
+
+
+@pytest.fixture
+def _mock_vllm_cleanup():
+    """Patch vLLM distributed cleanup so tests run without a GPU."""
+    with (
+        patch.object(vllm_backend_mod, "cleanup_dist_env_and_memory") as mock_dist,
+        patch.object(vllm_backend_mod, "cleanup_memory") as mock_mem,
+    ):
+        yield mock_dist, mock_mem
+
+
+@pytest.fixture
+def backend(_mock_vllm_cleanup, fixture_session_cache_dir):
+    """Create a VllmBackend with mocked dependencies."""
+    mock_metadata = MagicMock()
+    mock_metadata.adapter_path = None
+    mock_metadata.instruction = "Generate"
+    mock_metadata.prompt_config = MagicMock()
+    mock_metadata.prompt_config.template = "{instruction} {schema}"
+
+    mock_config = MagicMock()
+    # Pin branching fields so create_processor() selects TabularDataProcessor deterministically.
+    mock_config.time_series.is_timeseries = False
+    mock_config.data.group_training_examples_by = None
+    # Pin to a valid literal so StructuredOutputsConfig Pydantic validation passes in initialize().
+    mock_config.generation.structured_generation_backend = "xgrammar"
+    mock_config.generation.attention_backend = None
+
+    mock_workdir = MagicMock()
+    mock_workdir.schema_file = fixture_session_cache_dir / "schema.json"
+    mock_workdir.schema_file.parent.mkdir(parents=True, exist_ok=True)
+    mock_workdir.schema_file.write_text('{"properties": {"col_a": {"type": "string"}}}')
+    mock_workdir.adapter_path = None
+
+    return VllmBackend(config=mock_config, model_metadata=mock_metadata, workdir=mock_workdir)
+
+
+class TestTeardownIdempotency:
+    def test_first_teardown_runs_cleanup(self, backend, _mock_vllm_cleanup):
+        mock_dist, mock_mem = _mock_vllm_cleanup
+        backend.llm = MagicMock()
+
+        backend.teardown()
+
+        mock_dist.assert_called_once()
+        mock_mem.assert_called_once()
+        assert backend.llm is None
+        assert backend._torn_down is True
+
+    def test_second_teardown_is_noop(self, backend, _mock_vllm_cleanup):
+        mock_dist, mock_mem = _mock_vllm_cleanup
+
+        backend.teardown()
+        mock_dist.reset_mock()
+        mock_mem.reset_mock()
+
+        backend.teardown()
+
+        mock_dist.assert_not_called()
+        mock_mem.assert_not_called()
+
+    def test_initialize_resets_guard(self, backend, _mock_vllm_cleanup):
+        backend.teardown()
+        assert backend._torn_down is True
+
+        with patch.object(vllm_backend_mod, "vLLM"):
+            backend.initialize()
+
+        assert backend._torn_down is False
+
+
+class TestTeardownResilience:
+    def test_cleanup_memory_runs_even_if_dist_cleanup_fails(self, backend, _mock_vllm_cleanup):
+        mock_dist, mock_mem = _mock_vllm_cleanup
+        mock_dist.side_effect = RuntimeError("distributed cleanup failed")
+
+        backend.teardown()
+
+        mock_dist.assert_called_once()
+        mock_mem.assert_called_once()
+        assert backend.llm is None
+
+    def test_llm_cleared_even_if_dist_cleanup_fails(self, backend, _mock_vllm_cleanup):
+        mock_dist, _ = _mock_vllm_cleanup
+        mock_dist.side_effect = RuntimeError("boom")
+        backend.llm = MagicMock()
+
+        backend.teardown()
+
+        assert backend.llm is None
+
+
+class TestDunderDel:
+    def test_del_calls_teardown(self, backend, _mock_vllm_cleanup):
+        mock_dist, _ = _mock_vllm_cleanup
+
+        # Reset to isolate only this explicit __del__ call.
+        mock_dist.reset_mock()
+        backend.__del__()
+
+        mock_dist.assert_called_once()
+        assert backend._torn_down is True
+
+    def test_del_suppresses_exceptions(self, backend, _mock_vllm_cleanup):
+        mock_dist, _ = _mock_vllm_cleanup
+        mock_dist.side_effect = RuntimeError("boom")
+
+        backend.__del__()
+
+    def test_del_after_teardown_is_noop(self, backend, _mock_vllm_cleanup):
+        mock_dist, _ = _mock_vllm_cleanup
+
+        backend.teardown()
+        mock_dist.reset_mock()
+
+        backend.__del__()
+
+        mock_dist.assert_not_called()