[fix] Switch to nixl package and exclude nixl-cu13; Fix cleanup in test_inference_server_group (NovaSky-AI#1788)

SumanthRH · web-flow · commit c4eaded781fd · 2026-06-14T21:24:49.000-07:00
# What does this PR do? Fixes GPU CI failures for `test_engine_generation.py::test_pd_generation`, `test_engine_generation.py::test_pd_generation_non_colocated[1P1D_non_colocated]` and `test_inference_server_group.py::TestServerGroupAndRouter::test_pause_resume` 1. The P/D tests fail with `NIXL is not available`. The root cause is that we only use `nixl-cu12` but vllm imports `_api` from the `nixl` PyPI package. We had multiple fixes touch this package recently. NovaSky-AI#1756 excluded nixl-cu13 from the toml. We noticed some CI errors and NovaSky-AI#1759 changed it to just installing `nixl-cu12` directly instead of excluding `nixl-cu13`. The failure is fixed by reverting NovaSky-AI#1759. 2. `test_inference_server_group.py::TestServerGroupAndRouter::test_pause_resume` fails with `RuntimeError: There is no current event loop in thread 'MainThread'`. The fixture teardown now calls `asyncio.run(client.teardown())` instead of `asyncio.get_event_loop().run_until_complete(...)`. ## Test Plan - GPU CI run [before](https://console.anyscale.com/cld_hxkifz7xa22mwicp21nzkds1lw/prj_4b6c498rypyq6g7yhk6vzgjevt/jobs/prodjob_35xpydkq7qmlnef89t3xk4z7zh?job-tab=overview&job-logs-section-tabs=application_logs) <details> <summary>Failures</summary> ```bash =========================== short test summary info ============================ FAILED tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync_moe.py::test_worker_wrap_load_weights_preserves_moe_forward - RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync_moe.py::test_worker_wrap_multichunk_reload_preserves_moe_forward - RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/integrations/test_pd_routing.py::test_pd_routing_verification - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=42443, ip=10.1.137.178, actor_id=5be14503010a8f7d42ae78fe1d000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7ee87d605910>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/test_engine_generation.py::test_pd_generation - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=59962, ip=10.1.137.178, actor_id=1b2ed6e627c06ebf3c298df228000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7ed25ead5970>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/test_engine_generation.py::test_pd_generation_non_colocated[1P1D_non_colocated] - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=62434, ip=10.1.137.178, actor_id=f66e78b0c51b929ba4940ac52b000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7fa02f4e4950>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/test_skyrl_gym_generator.py::test_generator_formatting_no_use_conversation_multi_turn[unsloth/Llama-3.2-1B-Instruct] - assert 0 == 1 + where 0 = sum(<generator object test_generator_formatting_no_use_conversation_multi_turn.<locals>.<genexpr> at 0x7f8f014d7b90>) ERROR tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_inference_server_group.py::TestServerGroupAndRouter::test_pause_resume - RuntimeError: There is no current event loop in thread 'MainThread'. ERROR tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync.py::TestWeightUpdateFlow::test_update_weights_flow[pd_1P1D_non_colocated] - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=37642, ip=10.1.137.178, actor_id=c5ef1351b990537fa794fc7c15000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7f8c35395820>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", 
line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in 
launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} = 6 failed, 98 passed, 20 skipped, 23 deselected, 575 warnings, 2 errors in 7228.81s (2:00:28) = *** SIGTERM received at time=1781470490 on cpu 163 *** PC: @ 0x7f97393a48fb (unknown) syscall @ 0x7f958e5baa89 (unknown) signal_hook_registry::handler::h7f516a97b139a5d0 {"asctime":"2026-06-14 20:54:50,465","levelname":"E","message":"*** SIGTERM received at time=1781470490 on cpu 163 ***","filename":"logging.cc","lineno":474} {"asctime":"2026-06-14 20:54:50,465","levelname":"E","message":"PC: @ 0x7f97393a48fb (unknown) syscall","filename":"logging.cc","lineno":474} {"asctime":"2026-06-14 20:54:50,465","levelname":"E","message":" @ 0x7f958e5baa89 (unknown) signal_hook_registry::handler::h7f516a97b139a5d0","filename":"logging.cc","lineno":474} 2026-06-14 20:54:50 INFO vllm_router_rs::server: src/server.rs:1080: Received terminate signal, starting graceful shutdown 2026-06-14 20:54:50 WARN vllm_router_rs::middleware: src/middleware.rs:461: Concurrency queue processor shutting down sys:1: DeprecationWarning: builtin type swigvarlink has no __module__ attribute ``` </details> - GPU CI run [after](https://console.anyscale.com/cld_hxkifz7xa22mwicp21nzkds1lw/prj_4b6c498rypyq6g7yhk6vzgjevt/jobs/prodjob_m71sz4nf9jc1qig4xecgrrt2xq?job-tab=overview&job-logs-section-tabs=application_logs) <details> <summary>Failures</summary> ```bash =========================== short test summary info ============================ FAILED tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync_moe.py::test_worker_wrap_load_weights_preserves_moe_forward - RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} ====== 1 failed, 33 passed, 6 skipped, 108 warnings in 1251.71s (0:20:51) ====== sys:1: DeprecationWarning: builtin type swigvarlink has no __module__ attribute ``` </details> --------- Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -97,14 +97,22 @@ skyrl-train = [
     "vllm-router; sys_platform == 'linux'",
     "pybind11",
     "setuptools",
-    "nixl-cu12; sys_platform == 'linux'",
+    # `nixl-cu12` ships the `nixl_cu12` module, but vLLM imports `nixl._api`.
+    # The `nixl` shim provides that namespace and dispatches to `nixl_cu12`.
+    # Its metadata hard-depends on `nixl-cu13` too; that variant is overridden
+    # out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
+    "nixl; sys_platform == 'linux'",
 ]
 
 fsdp = [
     "skyrl[skyrl-train]",
     "vllm==0.20.2; sys_platform == 'linux'",
     "vllm-router; sys_platform == 'linux'",
-    "nixl-cu12; sys_platform == 'linux'",
+    # `nixl-cu12` ships the `nixl_cu12` module, but vLLM imports `nixl._api`.
+    # The `nixl` shim provides that namespace and dispatches to `nixl_cu12`.
+    # Its metadata hard-depends on `nixl-cu13` too; that variant is overridden
+    # out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
+    "nixl; sys_platform == 'linux'",
     "flash-linear-attention; sys_platform == 'linux'",
     "causal-conv1d; sys_platform == 'linux'",
     "flash-attn==2.8.3; sys_platform == 'linux'",
@@ -124,7 +132,11 @@ megatron = [
     "mamba-ssm>=2.3.0; sys_platform == 'linux'",
     "vllm==0.20.2; sys_platform == 'linux'",
     "vllm-router; sys_platform == 'linux'",
-    "nixl-cu12; sys_platform == 'linux'",
+    # `nixl-cu12` ships the `nixl_cu12` module, but vLLM imports `nixl._api`.
+    # The `nixl` shim provides that namespace and dispatches to `nixl_cu12`.
+    # Its metadata hard-depends on `nixl-cu13` too; that variant is overridden
+    # out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
+    "nixl; sys_platform == 'linux'",
     "torch==2.11.0; sys_platform == 'linux'",
     "flashinfer-python==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "torchvision; sys_platform == 'linux'",
@@ -224,7 +236,10 @@ override-dependencies = [
     "transformers>=5.6.1,<=5.8.0; sys_platform == 'linux'",
     "megatron-core>=0.16.0; sys_platform == 'linux' and python_version >= '3.12'",
     "ml_dtypes>=0.5.0; sys_platform == 'linux'",
-    "transformer-engine-cu13; sys_platform == 'never'"
+    "transformer-engine-cu13; sys_platform == 'never'",
+    # `nixl` hard-depends on both nixl-cu12 and nixl-cu13; drop the cu13 variant
+    # so it doesn't pull the CUDA-13 stack and bump torch off the cu12 pin.
+    "nixl-cu13; sys_platform == 'never'"
 ]
 
 [tool.uv.extra-build-dependencies]
diff --git a/tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_inference_server_group.py b/tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_inference_server_group.py
@@ -118,7 +118,9 @@ def server_group_and_router(class_scoped_ray_init_fixture):
         "client": client,
     }
 
-    asyncio.get_event_loop().run_until_complete(client.teardown())
+    # Teardown runs after the class-scoped event loop is gone, so create a
+    # fresh one. `asyncio.get_event_loop()` raises on py3.12 when no loop is set.
+    asyncio.run(client.teardown())
     router.shutdown()
     group.shutdown()
 
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -118,7 +118,9 @@ def server_group_and_router(class_scoped_ray_init_fixture):`
`118`	`118`	`"client": client,`
`119`	`119`	`}`
`120`	`120`
`121`		`- asyncio.get_event_loop().run_until_complete(client.teardown())`
	`121`	`+ # Teardown runs after the class-scoped event loop is gone, so create a`
	`122`	`` + # fresh one. `asyncio.get_event_loop()` raises on py3.12 when no loop is set. ``
	`123`	`+ asyncio.run(client.teardown())`
`122`	`124`	`router.shutdown()`
`123`	`125`	`group.shutdown()`
`124`	`126`