Skip to content

Commit d183676

Browse files
feat: add llmisvcs for multinode dp+ep, singlenode p/d
1 parent b648be2 commit d183676

File tree

4 files changed

+316
-35
lines changed

4 files changed

+316
-35
lines changed

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 199 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -260,61 +260,60 @@ def llmd_inference_service_gpu(
260260

261261

262262
@pytest.fixture(scope="class")
263-
def deepseek_r1_inference_service(
263+
def llmisvc_multinode_dp_ep(
264+
request: FixtureRequest,
264265
admin_client: DynamicClient,
265266
unprivileged_model_namespace: Namespace,
266267
) -> Generator[LLMInferenceService, None, None]:
267-
"""Fixture for DeepSeek R1 0528 model with multi-node configuration."""
268-
service_name = "deepseek-r1-0528"
268+
"""Fixture for DeepSeek Coder V2 model with multi-node configuration optimized for GCP."""
269+
# Extract parameters from pytest.mark.parametrize or use defaults
270+
params = getattr(request, "param", {})
271+
if not isinstance(params, dict):
272+
params = {}
273+
274+
service_name = params.get("service_name", "deepseek-coder-v2")
275+
storage_uri = params.get("storage_uri", "hf://deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
276+
model_name = params.get("model_name", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
269277

270278
# Define common environment variables for both template and worker
271279
common_env = [
280+
{"name": "VLLM_API_SERVER_COUNT", "value": "1"},
272281
{"name": "VLLM_LOGGING_LEVEL", "value": "INFO"},
273-
{"name": "KSERVE_INFER_ROCE", "value": "true"},
274282
{"name": "CUDA_DEVICE_ORDER", "value": "PCI_BUS_ID"},
275-
# Memory optimizations
276283
{"name": "VLLM_ADDITIONAL_ARGS", "value": "--gpu-memory-utilization 0.95 --max-model-len 8192 --enforce-eager"},
277-
{"name": "VLLM_ALL2ALL_BACKEND", "value": "deepep_high_throughput"},
284+
{"name": "VLLM_ALL2ALL_BACKEND", "value": "naive"},
278285
{"name": "PYTORCH_CUDA_ALLOC_CONF", "value": "expandable_segments:True"},
279-
# Essential NCCL configuration
280-
{"name": "NCCL_IB_GID_INDEX", "value": "3"},
286+
{"name": "NCCL_IB_DISABLE", "value": "1"},
287+
{"name": "NCCL_NET_GDR_LEVEL", "value": "0"},
288+
{"name": "NCCL_P2P_LEVEL", "value": "NVL"},
289+
{"name": "NCCL_SOCKET_IFNAME", "value": "eth0"},
290+
{"name": "NCCL_NSOCKS_PERTHREAD", "value": "2"},
291+
{"name": "NCCL_SOCKET_NTHREADS", "value": "2"},
292+
{"name": "NCCL_BUFFSIZE", "value": "2097152"},
281293
{"name": "NCCL_DEBUG", "value": "WARN"},
282-
{"name": "NCCL_SOCKET_IFNAME", "value": "net1"},
283-
{"name": "NCCL_IB_TIMEOUT", "value": "100"},
284-
# NVSHMEM configuration - optimized for stability
285-
{"name": "NVSHMEM_REMOTE_TRANSPORT", "value": "ibgda"},
294+
{"name": "NVSHMEM_REMOTE_TRANSPORT", "value": "ucx"},
295+
{"name": "NVSHMEM_DISABLE_CUDA_VMM", "value": "0"},
286296
{"name": "NVSHMEM_BOOTSTRAP_TWO_STAGE", "value": "1"},
287297
{"name": "NVSHMEM_BOOTSTRAP_TIMEOUT", "value": "300"},
288-
{"name": "NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME", "value": "net1"},
289-
{"name": "NVSHMEM_IB_GID_INDEX", "value": "3"},
290-
{"name": "NVSHMEM_USE_IBGDA", "value": "1"},
291-
{"name": "NVSHMEM_ENABLE_NIC_PE_MAPPING", "value": "1"},
292-
{"name": "NVSHMEM_IBGDA_SUPPORT", "value": "1"},
293-
{"name": "NVSHMEM_IB_ENABLE_IBGDA", "value": "1"},
294-
{"name": "NVSHMEM_IBGDA_NIC_HANDLER", "value": "gpu"},
295-
{"name": "NVSHMEM_DEBUG", "value": "WARN"},
296-
# UCX configuration for NVSHMEM
297-
{"name": "UCX_TLS", "value": "rc,sm,self,cuda_copy,cuda_ipc"},
298-
{"name": "UCX_IB_GID_INDEX", "value": "3"},
299-
{"name": "UCX_RC_MLX5_TM_ENABLE", "value": "n"},
300-
{"name": "UCX_UD_MLX5_RX_QUEUE_LEN", "value": "1024"},
298+
{"name": "NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME", "value": "eth0"},
299+
{"name": "NVSHMEM_DEBUG", "value": "INFO"},
300+
{"name": "UCX_TLS", "value": "tcp,sm,self,cuda_copy,cuda_ipc"},
301+
{"name": "UCX_NET_DEVICES", "value": "eth0"},
301302
{"name": "NVIDIA_GDRCOPY", "value": "enabled"},
302303
]
303304

304305
container_resources = {
305306
"limits": {
306307
"cpu": "128",
307-
"ephemeral-storage": "800Gi",
308+
"ephemeral-storage": "100Gi",
308309
"memory": "512Gi",
309310
"nvidia.com/gpu": "8",
310-
"rdma/roce_gdr": "1",
311311
},
312312
"requests": {
313313
"cpu": "64",
314-
"ephemeral-storage": "800Gi",
314+
"ephemeral-storage": "100Gi",
315315
"memory": "256Gi",
316316
"nvidia.com/gpu": "8",
317-
"rdma/roce_gdr": "1",
318317
},
319318
}
320319

@@ -327,7 +326,7 @@ def deepseek_r1_inference_service(
327326
}
328327

329328
parallelism_config = {
330-
"data": 32,
329+
"data": 16,
331330
"dataLocal": 8,
332331
"expert": True,
333332
"tensor": 1,
@@ -340,7 +339,6 @@ def deepseek_r1_inference_service(
340339
}
341340

342341
worker_spec = {
343-
"serviceAccountName": "hfsa",
344342
"containers": [
345343
{
346344
"name": "main",
@@ -351,25 +349,191 @@ def deepseek_r1_inference_service(
351349
}
352350

353351
annotations = {
354-
"k8s.v1.cni.cncf.io/networks": "roce-p2",
352+
"security.opendatahub.io/enable-network-policies": "false",
355353
}
356354

357355
with create_llmisvc(
358356
client=admin_client,
359357
name=service_name,
360358
namespace=unprivileged_model_namespace.name,
361-
storage_uri=ModelStorage.HF_DEEPSEEK_R1_0528,
362-
model_name="deepseek-ai/DeepSeek-R1-0528",
359+
storage_uri=storage_uri,
360+
model_name=model_name,
363361
replicas=1,
364362
parallelism=parallelism_config,
365363
router_config=router_config,
366364
container_env=common_env,
367365
container_resources=container_resources,
368366
liveness_probe=liveness_probe,
369-
service_account="hfsa",
370367
worker_config=worker_spec,
371368
annotations=annotations,
372369
wait=True,
373370
timeout=Timeout.TIMEOUT_30MIN,
374371
) as llm_service:
375372
yield llm_service
373+
374+
375+
@pytest.fixture(scope="class")
376+
def llmisvc_singlenode_prefill_decode(
377+
request: FixtureRequest,
378+
admin_client: DynamicClient,
379+
unprivileged_model_namespace: Namespace,
380+
) -> Generator[LLMInferenceService, None, None]:
381+
"""Fixture for single-node GPU LLMInferenceService with prefill-decode separation."""
382+
# Extract parameters from pytest.mark.parametrize or use defaults
383+
params = getattr(request, "param", {})
384+
if not isinstance(params, dict):
385+
params = {}
386+
387+
service_name = params.get("service_name", "qwen2-7b-instruct-pd")
388+
storage_uri = params.get("storage_uri", "hf://Qwen/Qwen2.5-7B-Instruct")
389+
model_name = params.get("model_name", "Qwen/Qwen2.5-7B-Instruct")
390+
decode_replicas = params.get("decode_replicas", 1)
391+
prefill_replicas = params.get("prefill_replicas", 2)
392+
393+
# Common environment variables for both prefill and decode (template)
394+
common_env = [
395+
# Enable RDMA for KV cache transfer
396+
{"name": "KSERVE_INFER_ROCE", "value": "true"},
397+
# Pod IP for KV transfer side channel
398+
{
399+
"name": "VLLM_NIXL_SIDE_CHANNEL_HOST",
400+
"valueFrom": {"fieldRef": {"fieldPath": "status.podIP"}},
401+
},
402+
# Enable KV cache transfer via NixlConnector (RDMA-based)
403+
{
404+
"name": "VLLM_ADDITIONAL_ARGS",
405+
"value": '--kv_transfer_config \'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\'',
406+
},
407+
# UCX configuration for RDMA transport
408+
{"name": "UCX_PROTO_INFO", "value": "y"},
409+
{"name": "UCX_TLS", "value": "rc,sm,self,cuda_copy,cuda_ipc"},
410+
]
411+
412+
container_resources = {
413+
"limits": {
414+
"cpu": "4",
415+
"memory": "32Gi",
416+
"nvidia.com/gpu": "1",
417+
"rdma/roce_gdr": "1",
418+
},
419+
"requests": {
420+
"cpu": "2",
421+
"memory": "16Gi",
422+
"nvidia.com/gpu": "1",
423+
"rdma/roce_gdr": "1",
424+
},
425+
}
426+
427+
liveness_probe = {
428+
"httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
429+
"initialDelaySeconds": 120,
430+
"periodSeconds": 30,
431+
"timeoutSeconds": 30,
432+
"failureThreshold": 5,
433+
}
434+
435+
# Scheduler config text for prefill-decode separation
436+
scheduler_config_text = """apiVersion: inference.networking.x-k8s.io/v1alpha1
437+
kind: EndpointPickerConfig
438+
plugins:
439+
- type: prefill-header-handler
440+
- type: prefill-filter
441+
- type: decode-filter
442+
- type: max-score-picker
443+
- type: queue-scorer
444+
parameters:
445+
hashBlockSize: 5
446+
maxPrefixBlocksToMatch: 256
447+
lruCapacityPerServer: 31250
448+
- type: pd-profile-handler
449+
parameters:
450+
threshold: 0
451+
hashBlockSize: 5
452+
schedulingProfiles:
453+
- name: prefill
454+
plugins:
455+
- pluginRef: prefill-filter
456+
- pluginRef: queue-scorer
457+
weight: 1.0
458+
- pluginRef: max-score-picker
459+
- name: decode
460+
plugins:
461+
- pluginRef: decode-filter
462+
- pluginRef: queue-scorer
463+
weight: 1.0
464+
- pluginRef: max-score-picker
465+
"""
466+
467+
# Router config with scheduler configuration for prefill-decode separation
468+
router_config = {
469+
"route": {},
470+
"gateway": {},
471+
"scheduler": {
472+
"template": {
473+
"containers": [
474+
{
475+
"name": "main",
476+
"args": [
477+
"--pool-name",
478+
"{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
479+
"--pool-namespace",
480+
"{{ .ObjectMeta.Namespace }}",
481+
"--zap-encoder",
482+
"json",
483+
"--grpc-port",
484+
"9002",
485+
"--grpc-health-port",
486+
"9003",
487+
"--secure-serving",
488+
"--model-server-metrics-scheme",
489+
"https",
490+
"--model-server-metrics-https-insecure-skip-verify",
491+
"--cert-path",
492+
"/etc/ssl/certs",
493+
"--config-text",
494+
scheduler_config_text,
495+
],
496+
}
497+
]
498+
}
499+
},
500+
}
501+
502+
# Prefill configuration
503+
prefill_config = {
504+
"replicas": prefill_replicas,
505+
"template": {
506+
"containers": [
507+
{
508+
"name": "main",
509+
"env": common_env,
510+
"resources": container_resources,
511+
"livenessProbe": liveness_probe,
512+
}
513+
]
514+
},
515+
}
516+
517+
# Annotations for RoCE network
518+
annotations = {
519+
# RoCE network required for KV cache transfer via RDMA
520+
"k8s.v1.cni.cncf.io/networks": "roce-p2",
521+
}
522+
523+
with create_llmisvc(
524+
client=admin_client,
525+
name=service_name,
526+
namespace=unprivileged_model_namespace.name,
527+
storage_uri=storage_uri,
528+
model_name=model_name,
529+
replicas=decode_replicas,
530+
router_config=router_config,
531+
container_env=common_env,
532+
container_resources=container_resources,
533+
liveness_probe=liveness_probe,
534+
prefill_config=prefill_config,
535+
annotations=annotations,
536+
wait=True,
537+
timeout=Timeout.TIMEOUT_30MIN,
538+
) as llm_service:
539+
yield llm_service
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import pytest
2+
3+
from tests.model_serving.model_server.llmd.utils import (
4+
verify_gateway_status,
5+
verify_llm_service_status,
6+
verify_llmd_no_failed_pods,
7+
)
8+
from utilities.constants import Protocols
9+
from utilities.llmd_utils import verify_inference_response_llmd
10+
from utilities.manifests.deepseek_coder_v2_lite import DEEPSEEK_CODER_V2_INFERENCE_CONFIG
11+
12+
pytestmark = [
13+
pytest.mark.llmd_gpu,
14+
pytest.mark.gpu,
15+
pytest.mark.model_server_gpu,
16+
]
17+
18+
19+
@pytest.mark.parametrize(
20+
"unprivileged_model_namespace",
21+
[pytest.param({"name": "llmd-multinode-test"})],
22+
indirect=True,
23+
)
24+
class TestMultiNodeLLMISVC:
25+
"""Multi Node LLMISVC test cases."""
26+
27+
def test_dp_ep(self, unprivileged_client, llmd_gateway, llmisvc_multinode_dp_ep):
28+
"""Test multi node llmisvc with DP + EP."""
29+
30+
llmisvc = llmisvc_multinode_dp_ep
31+
32+
assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
33+
assert verify_llm_service_status(llmisvc), "LLMInferenceService should be ready"
34+
35+
verify_inference_response_llmd(
36+
llm_service=llmisvc,
37+
inference_config=DEEPSEEK_CODER_V2_INFERENCE_CONFIG,
38+
inference_type="completions",
39+
protocol=Protocols.HTTPS,
40+
use_default_query=True,
41+
insecure=True,
42+
model_name=llmisvc.name,
43+
)
44+
45+
verify_llmd_no_failed_pods(client=unprivileged_client, llm_service=llmisvc)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import pytest
2+
3+
from tests.model_serving.model_server.llmd.utils import (
4+
verify_gateway_status,
5+
verify_llm_service_status,
6+
verify_llmd_no_failed_pods,
7+
)
8+
from utilities.constants import Protocols
9+
from utilities.llmd_utils import verify_inference_response_llmd
10+
from utilities.manifests.deepseek_coder_v2_lite import DEEPSEEK_CODER_V2_INFERENCE_CONFIG
11+
12+
pytestmark = [
13+
pytest.mark.llmd_gpu,
14+
pytest.mark.gpu,
15+
pytest.mark.model_server_gpu,
16+
]
17+
18+
19+
@pytest.mark.parametrize(
20+
"unprivileged_model_namespace",
21+
[pytest.param({"name": "llmd-multinode-test"})],
22+
indirect=True,
23+
)
24+
class TestSingleNodePrefillDecode:
25+
"""Single-node prefill/decode LLMISVC test cases."""
26+
27+
def test_prefill_decode(self, unprivileged_client, llmd_gateway, llmisvc_singlenode_prefill_decode):
28+
"""Test single-node llmisvc with prefill/decode separation."""
29+
30+
llmisvc = llmisvc_singlenode_prefill_decode
31+
32+
assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
33+
assert verify_llm_service_status(llmisvc), "LLMInferenceService should be ready"
34+
35+
verify_inference_response_llmd(
36+
llm_service=llmisvc,
37+
inference_config=DEEPSEEK_CODER_V2_INFERENCE_CONFIG,
38+
inference_type="completions",
39+
protocol=Protocols.HTTPS,
40+
use_default_query=True,
41+
insecure=True,
42+
model_name=llmisvc.name,
43+
)
44+
45+
verify_llmd_no_failed_pods(client=unprivileged_client, llm_service=llmisvc)

0 commit comments

Comments
 (0)