Skip to content

Commit c4eaded

Browse files
authored
[fix] Switch to nixl package and exclude nixl-cu13; Fix cleanup in test_inference_server_group (NovaSky-AI#1788)
# What does this PR do? Fixes GPU CI failures for `test_engine_generation.py::test_pd_generation`, `test_engine_generation.py::test_pd_generation_non_colocated[1P1D_non_colocated]` and `test_inference_server_group.py::TestServerGroupAndRouter::test_pause_resume` 1. The P/D tests fail with `NIXL is not available`. The root cause is that we only use `nixl-cu12` but vllm imports `_api` from the `nixl` PyPI package. We had multiple fixes touch this package recently. NovaSky-AI#1756 excluded nixl-cu13 from the toml. We noticed some CI errors and NovaSky-AI#1759 changed it to just installing `nixl-cu12` directly instead of excluding `nixl-cu13`. The failure is fixed by reverting NovaSky-AI#1759. 2. `test_inference_server_group.py::TestServerGroupAndRouter::test_pause_resume` fails with `RuntimeError: There is no current event loop in thread 'MainThread'` ## Test Plan - GPU CI run [before](https://console.anyscale.com/cld_hxkifz7xa22mwicp21nzkds1lw/prj_4b6c498rypyq6g7yhk6vzgjevt/jobs/prodjob_35xpydkq7qmlnef89t3xk4z7zh?job-tab=overview&job-logs-section-tabs=application_logs) <details> <summary>Failures</summary> ```bash =========================== short test summary info ============================ FAILED tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync_moe.py::test_worker_wrap_load_weights_preserves_moe_forward - RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync_moe.py::test_worker_wrap_multichunk_reload_preserves_moe_forward - RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/integrations/test_pd_routing.py::test_pd_routing_verification - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=42443, ip=10.1.137.178, actor_id=5be14503010a8f7d42ae78fe1d000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7ee87d605910>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmpBksJm4/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/test_engine_generation.py::test_pd_generation - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=59962, ip=10.1.137.178, actor_id=1b2ed6e627c06ebf3c298df228000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7ed25ead5970>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmpVdttC1/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/test_engine_generation.py::test_pd_generation_non_colocated[1P1D_non_colocated] - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=62434, ip=10.1.137.178, actor_id=f66e78b0c51b929ba4940ac52b000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7fa02f4e4950>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmp2XwpZ8/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} FAILED tests/backends/skyrl_train/gpu/gpu_ci/test_skyrl_gym_generator.py::test_generator_formatting_no_use_conversation_multi_turn[unsloth/Llama-3.2-1B-Instruct] - assert 0 == 1 + where 0 = sum(<generator object test_generator_formatting_no_use_conversation_multi_turn.<locals>.<genexpr> at 0x7f8f014d7b90>) ERROR tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_inference_server_group.py::TestServerGroupAndRouter::test_pause_resume - RuntimeError: There is no current event loop in thread 'MainThread'. ERROR tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync.py::TestWeightUpdateFlow::test_update_weights_flow[pd_1P1D_non_colocated] - ray.exceptions.RayTaskError(RuntimeError): ray::VLLMServerActor.start() (pid=37642, ip=10.1.137.178, actor_id=c5ef1351b990537fa794fc7c15000000, repr=<skyrl.backends.skyrl_train.inference_servers.vllm_server_actor.VLLMServerActor object at 0x7f8c35395820>) File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result raise self._exception ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 279, in start await self._wait_until_healthy() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", 
line 294, in _wait_until_healthy raise exc ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/tmp/ray/session_2026-06-14_18-52-12_482543_634/runtime_resources/working_dir_files/s3_anyscale-k8s-sky-anyscale-aws-us-east-1-e88d5dbd_org_xc6lv84h3d7m9dljcc17esfw2i_cld_hxkifz7xa22mwicp21nzkds1lw_runtime_env_packages_pkg_991c7596ffe950b0d97128cdcfd34c93/skyrl/backends/skyrl_train/inference_servers/vllm_server_actor.py", line 335, in _run_server self._engine = AsyncLLMEngine.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 246, in from_engine_args return cls( ^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 146, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client return AsyncMPClient(*client_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 900, in __init__ super().__init__( File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__ with launch_core_engines( ^^^^^^^^^^^^^^^^^^^^ File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 144, in __exit__ next(self.gen) File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1119, in 
launch_core_engines wait_for_engine_startup( File "/home/ray/.cache/uv/builds-v0/.tmpg2XhWJ/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1178, in wait_for_engine_startup raise RuntimeError( RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} = 6 failed, 98 passed, 20 skipped, 23 deselected, 575 warnings, 2 errors in 7228.81s (2:00:28) = *** SIGTERM received at time=1781470490 on cpu 163 *** PC: @ 0x7f97393a48fb (unknown) syscall @ 0x7f958e5baa89 (unknown) signal_hook_registry::handler::h7f516a97b139a5d0 {"asctime":"2026-06-14 20:54:50,465","levelname":"E","message":"*** SIGTERM received at time=1781470490 on cpu 163 ***","filename":"logging.cc","lineno":474} {"asctime":"2026-06-14 20:54:50,465","levelname":"E","message":"PC: @ 0x7f97393a48fb (unknown) syscall","filename":"logging.cc","lineno":474} {"asctime":"2026-06-14 20:54:50,465","levelname":"E","message":" @ 0x7f958e5baa89 (unknown) signal_hook_registry::handler::h7f516a97b139a5d0","filename":"logging.cc","lineno":474} 2026-06-14 20:54:50 INFO vllm_router_rs::server: src/server.rs:1080: Received terminate signal, starting graceful shutdown 2026-06-14 20:54:50 WARN vllm_router_rs::middleware: src/middleware.rs:461: Concurrency queue processor shutting down sys:1: DeprecationWarning: builtin type swigvarlink has no __module__ attribute ``` </details> - GPU CI run [after](https://console.anyscale.com/cld_hxkifz7xa22mwicp21nzkds1lw/prj_4b6c498rypyq6g7yhk6vzgjevt/jobs/prodjob_m71sz4nf9jc1qig4xecgrrt2xq?job-tab=overview&job-logs-section-tabs=application_logs) <details> <summary>Failures</summary> ```bash =========================== short test summary info ============================ FAILED tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_weight_sync_moe.py::test_worker_wrap_load_weights_preserves_moe_forward - RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} ====== 1 failed, 33 passed, 6 skipped, 108 warnings in 1251.71s (0:20:51) ====== sys:1: DeprecationWarning: builtin type swigvarlink has no __module__ attribute ``` </details> --------- Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
1 parent 5a527df commit c4eaded

3 files changed

Lines changed: 42 additions & 13 deletions

File tree

pyproject.toml

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,22 @@ skyrl-train = [
9797
"vllm-router; sys_platform == 'linux'",
9898
"pybind11",
9999
"setuptools",
100-
"nixl-cu12; sys_platform == 'linux'",
100+
# The `nixl` shim provides the `nixl` namespace and dispatches to `nixl_cu12`.
101+
# `nixl-cu12` ships the `nixl_cu12` module, but vLLM imports `nixl._api`.
102+
# The `nixl` shim's metadata hard-depends on `nixl-cu13` too; that variant is overridden
103+
# out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
104+
"nixl; sys_platform == 'linux'",
101105
]
102106

103107
fsdp = [
104108
"skyrl[skyrl-train]",
105109
"vllm==0.20.2; sys_platform == 'linux'",
106110
"vllm-router; sys_platform == 'linux'",
107-
"nixl-cu12; sys_platform == 'linux'",
111+
# The `nixl` shim provides the `nixl` namespace and dispatches to `nixl_cu12`.
112+
# `nixl-cu12` ships the `nixl_cu12` module, but vLLM imports `nixl._api`.
113+
# The `nixl` shim's metadata hard-depends on `nixl-cu13` too; that variant is overridden
114+
# out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
115+
"nixl; sys_platform == 'linux'",
108116
"flash-linear-attention; sys_platform == 'linux'",
109117
"causal-conv1d; sys_platform == 'linux'",
110118
"flash-attn==2.8.3; sys_platform == 'linux'",
@@ -124,7 +132,11 @@ megatron = [
124132
"mamba-ssm>=2.3.0; sys_platform == 'linux'",
125133
"vllm==0.20.2; sys_platform == 'linux'",
126134
"vllm-router; sys_platform == 'linux'",
127-
"nixl-cu12; sys_platform == 'linux'",
135+
# The `nixl` shim provides the `nixl` namespace and dispatches to `nixl_cu12`.
136+
# `nixl-cu12` ships the `nixl_cu12` module, but vLLM imports `nixl._api`.
137+
# The `nixl` shim's metadata hard-depends on `nixl-cu13` too; that variant is overridden
138+
# out below (it would drag in the CUDA-13 stack and break the cu12 torch pin).
139+
"nixl; sys_platform == 'linux'",
128140
"torch==2.11.0; sys_platform == 'linux'",
129141
"flashinfer-python==0.6.8.post1; sys_platform == 'linux' and platform_machine == 'x86_64'",
130142
"torchvision; sys_platform == 'linux'",
@@ -224,7 +236,10 @@ override-dependencies = [
224236
"transformers>=5.6.1,<=5.8.0; sys_platform == 'linux'",
225237
"megatron-core>=0.16.0; sys_platform == 'linux' and python_version >= '3.12'",
226238
"ml_dtypes>=0.5.0; sys_platform == 'linux'",
227-
"transformer-engine-cu13; sys_platform == 'never'"
239+
"transformer-engine-cu13; sys_platform == 'never'",
240+
# `nixl` hard-depends on both nixl-cu12 and nixl-cu13; drop the cu13 variant
241+
# so it doesn't pull the CUDA-13 stack and bump torch off the cu12 pin.
242+
"nixl-cu13; sys_platform == 'never'"
228243
]
229244

230245
[tool.uv.extra-build-dependencies]

tests/backends/skyrl_train/gpu/gpu_ci/inference_servers/test_inference_server_group.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,9 @@ def server_group_and_router(class_scoped_ray_init_fixture):
118118
"client": client,
119119
}
120120

121-
asyncio.get_event_loop().run_until_complete(client.teardown())
121+
# Teardown runs after the class-scoped event loop is gone, so create a
122+
# fresh one. `asyncio.get_event_loop()` raises on py3.12 when no loop is set.
123+
asyncio.run(client.teardown())
122124
router.shutdown()
123125
group.shutdown()
124126

uv.lock

Lines changed: 20 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)