[BugFix] Fix FP32 override registration path in vLLM plugin (#3861)

vmoens · web-flow · commit ad8ea7fb8eaf · 2026-06-12T16:03:45.000+01:00
diff --git a/test/llm/test_vllm_plugin.py b/test/llm/test_vllm_plugin.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import annotations
+
+import importlib
+
+import pytest
+
+from torchrl.modules.llm.backends.vllm.vllm_plugin import FP32_MODEL_OVERRIDES
+
+
+@pytest.mark.parametrize("arch", sorted(FP32_MODEL_OVERRIDES))
+def test_fp32_override_paths_importable(arch):
+    """Check that the override registered for ``arch`` names a real attribute.
+
+    The "module.path:ClassName" strings are resolved lazily by vLLM, so a
+    stale path would otherwise surface only at server startup, when vLLM
+    inspects the architecture. vLLM is not needed to run this test: when it
+    is absent, ``_models`` provides placeholder classes at the same path.
+    """
+    override = FP32_MODEL_OVERRIDES[arch]
+    module_path, _sep, class_name = override.partition(":")
+    assert hasattr(importlib.import_module(module_path), class_name), override
diff --git a/torchrl/modules/llm/backends/vllm/vllm_async.py b/torchrl/modules/llm/backends/vllm/vllm_async.py
@@ -1928,7 +1928,7 @@ def make_async_vllm_engine(
         compile (bool, optional): Whether to enable model compilation for better performance. Defaults to True.
         enable_fp32_output (bool, optional): Whether to enable FP32 output for the final layer. Defaults to False.
             This can help with numerical stability for certain models. Requires model-specific support in
-            torchrl.modules.llm.backends._models.
+            torchrl.modules.llm.backends.vllm._models.
         tensor_parallel_size (int, optional): Number of devices to use, per replica. Defaults to None.
         data_parallel_size (int, optional): Number of data parallel groups to use. Defaults to None.
         pipeline_parallel_size (int, optional): Number of pipeline parallel groups to use. Defaults to None.
diff --git a/torchrl/modules/llm/backends/vllm/vllm_plugin.py b/torchrl/modules/llm/backends/vllm/vllm_plugin.py
@@ -7,16 +7,18 @@
 
 from torchrl._utils import logger
 
+# Maps each vLLM architecture name to the "module.path:ClassName" override that
+# register_fp32_overrides() registers; test_vllm_plugin.py imports every path.
+FP32_MODEL_OVERRIDES: dict[str, str] = {
+    "Qwen3ForCausalLM": "torchrl.modules.llm.backends.vllm._models:Qwen3ForCausalLMFP32",
+}
+
 
 def register_fp32_overrides() -> None:
-    """Register FP32 overrides for vLLM models."""
+    """Register each FP32_MODEL_OVERRIDES entry with vLLM's ModelRegistry."""
     from vllm.model_executor.models.registry import ModelRegistry
 
-    # ======= Register models here =======
-    # Register Qwen3 models with FP32 override
-    ModelRegistry.register_model(
-        "Qwen3ForCausalLM",
-        "torchrl.modules.llm.backends._models:Qwen3ForCausalLMFP32",
-    )
+    for arch, model_cls_path in FP32_MODEL_OVERRIDES.items():
+        ModelRegistry.register_model(arch, model_cls_path)
 
-    logger.info("Registered Qwen3 FP32 model overrides")
+    logger.info("Registered FP32 overrides for: %s", ", ".join(FP32_MODEL_OVERRIDES))
diff --git a/torchrl/modules/llm/backends/vllm/vllm_sync.py b/torchrl/modules/llm/backends/vllm/vllm_sync.py
@@ -402,7 +402,7 @@ def make_vllm_worker(
         enforce_eager (bool, optional): Whether to enforce eager execution. Defaults to `False`.
         enable_fp32_output (bool, optional): Whether to enable FP32 output for the final layer. Defaults to False.
             This can help with numerical stability for certain models. Requires model-specific support in
-            torchrl.modules.llm.backends._models.
+            torchrl.modules.llm.backends.vllm._models.
         **kwargs: Additional arguments passed to vLLM.LLM.__init__.
 
     Returns: