[BugFix] Make the vLLM FP32 plugin opt-in so importing torchrl can't hijack a host vLLM (#3868)

vmoens · web-flow · commit e797c196687f · 2026-06-15T16:29:52.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -240,6 +240,10 @@ ignore-words-list = "multicat,nd,splitted,te,uncompressible,dout"
 first_party_detection = false
 
 [project.entry-points."vllm.general_plugins"]
-# Ensure FP32 overrides are registered in all vLLM processes (main, workers, and
-# the registry subprocess) before resolving model classes.
-fp32_overrides = "torchrl.modules.llm.backends.vllm.vllm_plugin:register_fp32_overrides"
+# Auto-loaded by vLLM in every process (main, workers, and the registry
+# subprocess), but a NO-OP unless that process opted in via
+# TORCHRL_VLLM_FP32_OVERRIDES (set by torchrl's vLLM backend when
+# enable_fp32_output=True). The unique name avoids colliding with other projects'
+# vllm.general_plugins entries: vLLM keys discovered plugins by name, so a shared
+# name would silently drop one of them.
+torchrl_fp32_overrides = "torchrl.modules.llm.backends.vllm.vllm_plugin:register_fp32_overrides"
diff --git a/test/llm/test_vllm_plugin.py b/test/llm/test_vllm_plugin.py
@@ -8,7 +8,12 @@
 
 import pytest
 
-from torchrl.modules.llm.backends.vllm.vllm_plugin import FP32_MODEL_OVERRIDES
+from torchrl.modules.llm.backends.vllm.vllm_plugin import (
+    FP32_MODEL_OVERRIDES,
+    fp32_overrides_enabled,
+    FP32_OVERRIDES_ENV_VAR,
+    register_fp32_overrides,
+)
 
 
 @pytest.mark.parametrize("arch", sorted(FP32_MODEL_OVERRIDES))
@@ -23,3 +28,41 @@ def test_fp32_override_paths_importable(arch):
     module_path, _, class_name = FP32_MODEL_OVERRIDES[arch].partition(":")
     module = importlib.import_module(module_path)
     assert hasattr(module, class_name), FP32_MODEL_OVERRIDES[arch]
+
+
+@pytest.mark.parametrize(
+    "value,expected",
+    [
+        (None, False),
+        ("0", False),
+        ("", False),
+        ("no", False),
+        ("1", True),
+        ("true", True),
+        ("True", True),
+        ("yes", True),
+    ],
+)
+def test_fp32_overrides_enabled_reads_env(monkeypatch, value, expected):
+    """Check which env values enable the opt-in (1/true/True/yes) and which do not."""
+    if value is None:
+        monkeypatch.delenv(FP32_OVERRIDES_ENV_VAR, raising=False)
+    else:
+        monkeypatch.setenv(FP32_OVERRIDES_ENV_VAR, value)
+    assert fp32_overrides_enabled() is expected
+
+
+def test_register_fp32_overrides_is_noop_without_optin(monkeypatch):
+    """Without the opt-in, registration must do nothing -- and must not even
+    import vLLM. This is what lets another project install torchrl without its
+    vLLM ``ModelRegistry`` being mutated. Returning before the vLLM import keeps
+    the no-op path safe on machines with no vLLM at all.
+    """
+    import sys
+
+    monkeypatch.delenv(FP32_OVERRIDES_ENV_VAR, raising=False)
+    # A ``None`` entry in ``sys.modules`` makes importing that name raise
+    # ImportError, so the call below fails if the no-op path ever reaches the
+    # vLLM import -- on machines with vLLM installed as well as without it.
+    monkeypatch.setitem(sys.modules, "vllm.model_executor.models.registry", None)
+    register_fp32_overrides()
diff --git a/test/transforms/test_reward_transforms.py b/test/transforms/test_reward_transforms.py
@@ -552,9 +552,9 @@ def test_trans_parallel_env_check(self, maybe_fork_ParallelEnv):
             except RuntimeError:
                 pass
 
-    @pytest.mark.parametrize("has_in_keys,", [True, False])
+    @pytest.mark.parametrize("has_in_keys", [True, False])
     @pytest.mark.parametrize(
-        "reset_keys,", [[("some", "nested", "reset")], ["_reset"] * 3, None]
+        "reset_keys", [[("some", "nested", "reset")], ["_reset"] * 3, None]
     )
     def test_trans_multi_key(
         self, has_in_keys, reset_keys, n_workers=2, batch_size=(3, 2), max_steps=5
diff --git a/torchrl/modules/llm/backends/vllm/vllm_async.py b/torchrl/modules/llm/backends/vllm/vllm_async.py
@@ -28,6 +28,7 @@
 
 # Import RLvLLMEngine and shared utilities
 from .base import RLvLLMEngine
+from .vllm_plugin import FP32_OVERRIDES_ENV_VAR
 
 
 _has_vllm = True
@@ -1966,6 +1967,9 @@ def make_async_vllm_engine(
     # Set FP32 output environment variable if requested
     if enable_fp32_output:
         os.environ["VLLM_ENABLE_FP32_OUTPUT"] = "1"
+        # Opt the engine + its child vLLM processes into torchrl's FP32 model
+        # overrides (the general-plugin no-ops without this).
+        os.environ[FP32_OVERRIDES_ENV_VAR] = "1"
         torchrl_logger.info(
             "Enabled FP32 output for vLLM (VLLM_ENABLE_FP32_OUTPUT=1). "
             "This will use FP32 for the final output layer if the model supports it."
diff --git a/torchrl/modules/llm/backends/vllm/vllm_plugin.py b/torchrl/modules/llm/backends/vllm/vllm_plugin.py
@@ -5,20 +5,71 @@
 
 from __future__ import annotations
 
+import os
+
 from torchrl._utils import logger
 
+# Env var that opts a vLLM process into torchrl's FP32 model overrides. torchrl's
+# vLLM backend sets it (enable_fp32_output=True) before the engine and its
+# subprocesses start, so the overrides register in every vLLM process torchrl
+# owns -- and in none that it does not.
+FP32_OVERRIDES_ENV_VAR = "TORCHRL_VLLM_FP32_OVERRIDES"
+
 # Architecture name -> "module.path:ClassName" overrides registered with vLLM.
 # Each path must stay importable; test_vllm_plugin.py guards against drift.
 FP32_MODEL_OVERRIDES: dict[str, str] = {
     "Qwen3ForCausalLM": "torchrl.modules.llm.backends.vllm._models:Qwen3ForCausalLMFP32",
 }
 
 
+def fp32_overrides_enabled() -> bool:
+    """Whether this process opted into torchrl's vLLM FP32 model overrides.
+
+    Returns:
+        ``True`` when ``TORCHRL_VLLM_FP32_OVERRIDES`` is ``1``, ``true`` or
+        ``yes`` (case-insensitive, surrounding whitespace ignored); ``False``
+        when it is unset or holds any other value.
+    """
+    # Strip before comparing so a stray space or newline around the value
+    # (shell quoting, .env files) does not silently turn the opt-in off.
+    value = os.environ.get(FP32_OVERRIDES_ENV_VAR, "0").strip().lower()
+    return value in ("1", "true", "yes")
+
+
 def register_fp32_overrides() -> None:
-    """Register FP32 overrides for vLLM models."""
+    """Register torchrl's FP32 vLLM model overrides -- only when opted in.
+
+    vLLM auto-loads this through the ``vllm.general_plugins`` entry point in
+    *every* vLLM process, so it must do nothing unless this process explicitly
+    asked for torchrl's overrides via ``TORCHRL_VLLM_FP32_OVERRIDES``. Otherwise
+    merely *installing* torchrl would mutate an unrelated project's vLLM
+    ``ModelRegistry`` -- replacing its model classes with torchrl's, which track
+    a newer vLLM API and would break an older host vLLM at logits time.
+
+    Raises:
+        ImportError: If this process opted in but vLLM cannot be imported.
+    """
+    if not fp32_overrides_enabled():
+        return
+
     from vllm.model_executor.models.registry import ModelRegistry
 
     for arch, model_cls_path in FP32_MODEL_OVERRIDES.items():
         ModelRegistry.register_model(arch, model_cls_path)
 
-    logger.info("Registered Qwen3 FP32 model overrides")
+    logger.info("Registered torchrl FP32 vLLM model overrides")
+
+
+def enable_fp32_overrides() -> None:
+    """Opt this process and its child vLLM processes into torchrl's overrides.
+
+    Sets ``TORCHRL_VLLM_FP32_OVERRIDES`` so spawned vLLM workers and the registry
+    subprocess inherit the opt-in, then registers in-process. Call before
+    constructing a vLLM engine when you want torchrl's FP32 model overrides.
+
+    Raises:
+        ImportError: If vLLM cannot be imported. The env var is already set by
+            then, so the opt-in stays in place for this process.
+    """
+    os.environ[FP32_OVERRIDES_ENV_VAR] = "1"
+    register_fp32_overrides()
diff --git a/torchrl/modules/llm/backends/vllm/vllm_sync.py b/torchrl/modules/llm/backends/vllm/vllm_sync.py
@@ -19,6 +19,7 @@
 from torchrl.modules.llm.utils import _cuda_visible_devices
 
 from .base import RLvLLMEngine
+from .vllm_plugin import FP32_OVERRIDES_ENV_VAR
 
 try:
     from vllm import LLM
@@ -424,6 +425,9 @@ def make_vllm_worker(
     # Set FP32 output environment variable if requested
     if enable_fp32_output:
         os.environ["VLLM_ENABLE_FP32_OUTPUT"] = "1"
+        # Opt the engine + its child vLLM processes into torchrl's FP32 model
+        # overrides (the general-plugin no-ops without this).
+        os.environ[FP32_OVERRIDES_ENV_VAR] = "1"
         torchrl_logger.info(
             "Enabled FP32 output for vLLM (VLLM_ENABLE_FP32_OUTPUT=1). "
             "This will use FP32 for the final output layer if the model supports it."