@@ -260,61 +260,60 @@ def llmd_inference_service_gpu(
260260
261261
262262@pytest .fixture (scope = "class" )
263- def deepseek_r1_inference_service (
263+ def llmisvc_multinode_dp_ep (
264+ request : FixtureRequest ,
264265 admin_client : DynamicClient ,
265266 unprivileged_model_namespace : Namespace ,
266267) -> Generator [LLMInferenceService , None , None ]:
267- """Fixture for DeepSeek R1 0528 model with multi-node configuration."""
268- service_name = "deepseek-r1-0528"
268+ """Fixture for DeepSeek Coder V2 model with multi-node configuration optimized for GCP."""
269+ # Extract parameters from pytest.mark.parametrize or use defaults
270+ params = getattr (request , "param" , {})
271+ if not isinstance (params , dict ):
272+ params = {}
273+
274+ service_name = params .get ("service_name" , "deepseek-coder-v2" )
275+ storage_uri = params .get ("storage_uri" , "hf://deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" )
276+ model_name = params .get ("model_name" , "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" )
269277
270278 # Define common environment variables for both template and worker
271279 common_env = [
280+ {"name" : "VLLM_API_SERVER_COUNT" , "value" : "1" },
272281 {"name" : "VLLM_LOGGING_LEVEL" , "value" : "INFO" },
273- {"name" : "KSERVE_INFER_ROCE" , "value" : "true" },
274282 {"name" : "CUDA_DEVICE_ORDER" , "value" : "PCI_BUS_ID" },
275- # Memory optimizations
276283 {"name" : "VLLM_ADDITIONAL_ARGS" , "value" : "--gpu-memory-utilization 0.95 --max-model-len 8192 --enforce-eager" },
277- {"name" : "VLLM_ALL2ALL_BACKEND" , "value" : "deepep_high_throughput " },
284+ {"name" : "VLLM_ALL2ALL_BACKEND" , "value" : "naive " },
278285 {"name" : "PYTORCH_CUDA_ALLOC_CONF" , "value" : "expandable_segments:True" },
279- # Essential NCCL configuration
280- {"name" : "NCCL_IB_GID_INDEX" , "value" : "3" },
286+ {"name" : "NCCL_IB_DISABLE" , "value" : "1" },
287+ {"name" : "NCCL_NET_GDR_LEVEL" , "value" : "0" },
288+ {"name" : "NCCL_P2P_LEVEL" , "value" : "NVL" },
289+ {"name" : "NCCL_SOCKET_IFNAME" , "value" : "eth0" },
290+ {"name" : "NCCL_NSOCKS_PERTHREAD" , "value" : "2" },
291+ {"name" : "NCCL_SOCKET_NTHREADS" , "value" : "2" },
292+ {"name" : "NCCL_BUFFSIZE" , "value" : "2097152" },
281293 {"name" : "NCCL_DEBUG" , "value" : "WARN" },
282- {"name" : "NCCL_SOCKET_IFNAME" , "value" : "net1" },
283- {"name" : "NCCL_IB_TIMEOUT" , "value" : "100" },
284- # NVSHMEM configuration - optimized for stability
285- {"name" : "NVSHMEM_REMOTE_TRANSPORT" , "value" : "ibgda" },
294+ {"name" : "NVSHMEM_REMOTE_TRANSPORT" , "value" : "ucx" },
295+ {"name" : "NVSHMEM_DISABLE_CUDA_VMM" , "value" : "0" },
286296 {"name" : "NVSHMEM_BOOTSTRAP_TWO_STAGE" , "value" : "1" },
287297 {"name" : "NVSHMEM_BOOTSTRAP_TIMEOUT" , "value" : "300" },
288- {"name" : "NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME" , "value" : "net1" },
289- {"name" : "NVSHMEM_IB_GID_INDEX" , "value" : "3" },
290- {"name" : "NVSHMEM_USE_IBGDA" , "value" : "1" },
291- {"name" : "NVSHMEM_ENABLE_NIC_PE_MAPPING" , "value" : "1" },
292- {"name" : "NVSHMEM_IBGDA_SUPPORT" , "value" : "1" },
293- {"name" : "NVSHMEM_IB_ENABLE_IBGDA" , "value" : "1" },
294- {"name" : "NVSHMEM_IBGDA_NIC_HANDLER" , "value" : "gpu" },
295- {"name" : "NVSHMEM_DEBUG" , "value" : "WARN" },
296- # UCX configuration for NVSHMEM
297- {"name" : "UCX_TLS" , "value" : "rc,sm,self,cuda_copy,cuda_ipc" },
298- {"name" : "UCX_IB_GID_INDEX" , "value" : "3" },
299- {"name" : "UCX_RC_MLX5_TM_ENABLE" , "value" : "n" },
300- {"name" : "UCX_UD_MLX5_RX_QUEUE_LEN" , "value" : "1024" },
298+ {"name" : "NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME" , "value" : "eth0" },
299+ {"name" : "NVSHMEM_DEBUG" , "value" : "INFO" },
300+ {"name" : "UCX_TLS" , "value" : "tcp,sm,self,cuda_copy,cuda_ipc" },
301+ {"name" : "UCX_NET_DEVICES" , "value" : "eth0" },
301302 {"name" : "NVIDIA_GDRCOPY" , "value" : "enabled" },
302303 ]
303304
304305 container_resources = {
305306 "limits" : {
306307 "cpu" : "128" ,
307- "ephemeral-storage" : "800Gi " ,
308+ "ephemeral-storage" : "100Gi " ,
308309 "memory" : "512Gi" ,
309310 "nvidia.com/gpu" : "8" ,
310- "rdma/roce_gdr" : "1" ,
311311 },
312312 "requests" : {
313313 "cpu" : "64" ,
314- "ephemeral-storage" : "800Gi " ,
314+ "ephemeral-storage" : "100Gi " ,
315315 "memory" : "256Gi" ,
316316 "nvidia.com/gpu" : "8" ,
317- "rdma/roce_gdr" : "1" ,
318317 },
319318 }
320319
@@ -327,7 +326,7 @@ def deepseek_r1_inference_service(
327326 }
328327
329328 parallelism_config = {
330- "data" : 32 ,
329+ "data" : 16 ,
331330 "dataLocal" : 8 ,
332331 "expert" : True ,
333332 "tensor" : 1 ,
@@ -340,7 +339,6 @@ def deepseek_r1_inference_service(
340339 }
341340
342341 worker_spec = {
343- "serviceAccountName" : "hfsa" ,
344342 "containers" : [
345343 {
346344 "name" : "main" ,
@@ -351,25 +349,191 @@ def deepseek_r1_inference_service(
351349 }
352350
353351 annotations = {
354- "k8s.v1.cni.cncf. io/networks " : "roce-p2 " ,
352+ "security.opendatahub. io/enable-network-policies " : "false " ,
355353 }
356354
357355 with create_llmisvc (
358356 client = admin_client ,
359357 name = service_name ,
360358 namespace = unprivileged_model_namespace .name ,
361- storage_uri = ModelStorage . HF_DEEPSEEK_R1_0528 ,
362- model_name = "deepseek-ai/DeepSeek-R1-0528" ,
359+ storage_uri = storage_uri ,
360+ model_name = model_name ,
363361 replicas = 1 ,
364362 parallelism = parallelism_config ,
365363 router_config = router_config ,
366364 container_env = common_env ,
367365 container_resources = container_resources ,
368366 liveness_probe = liveness_probe ,
369- service_account = "hfsa" ,
370367 worker_config = worker_spec ,
371368 annotations = annotations ,
372369 wait = True ,
373370 timeout = Timeout .TIMEOUT_30MIN ,
374371 ) as llm_service :
375372 yield llm_service
373+
374+
@pytest.fixture(scope="class")
def llmisvc_singlenode_prefill_decode(
    request: FixtureRequest,
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
) -> Generator[LLMInferenceService, None, None]:
    """Yield a single-node GPU LLMInferenceService with prefill-decode separation.

    Optional overrides are read from ``request.param`` (e.g. supplied through
    ``pytest.mark.parametrize``). A missing or non-dict param is treated as an
    empty dict, so every setting falls back to its default.

    Recognized keys and their defaults:
        service_name: ``"qwen2-7b-instruct-pd"``
        storage_uri: ``"hf://Qwen/Qwen2.5-7B-Instruct"``
        model_name: ``"Qwen/Qwen2.5-7B-Instruct"``
        decode_replicas: ``1`` (passed as the service's ``replicas``)
        prefill_replicas: ``2`` (passed as ``replicas`` of the prefill config)

    Yields:
        The LLMInferenceService produced by ``create_llmisvc``; the yield happens
        inside that context manager, which is called with ``wait=True`` and a
        30 minute timeout.
    """
    # Extract parameters from pytest.mark.parametrize or use defaults
    params = getattr(request, "param", {})
    if not isinstance(params, dict):
        params = {}

    service_name = params.get("service_name", "qwen2-7b-instruct-pd")
    storage_uri = params.get("storage_uri", "hf://Qwen/Qwen2.5-7B-Instruct")
    model_name = params.get("model_name", "Qwen/Qwen2.5-7B-Instruct")
    decode_replicas = params.get("decode_replicas", 1)
    prefill_replicas = params.get("prefill_replicas", 2)

    # Common environment variables for both prefill and decode (template)
    common_env = [
        # Enable RDMA for KV cache transfer
        {"name": "KSERVE_INFER_ROCE", "value": "true"},
        # Pod IP for KV transfer side channel
        {
            "name": "VLLM_NIXL_SIDE_CHANNEL_HOST",
            "valueFrom": {"fieldRef": {"fieldPath": "status.podIP"}},
        },
        # Enable KV cache transfer via NixlConnector (RDMA-based)
        {
            "name": "VLLM_ADDITIONAL_ARGS",
            "value": '--kv_transfer_config \' {"kv_connector":"NixlConnector","kv_role":"kv_both"}\' ',
        },
        # UCX configuration for RDMA transport
        {"name": "UCX_PROTO_INFO", "value": "y"},
        {"name": "UCX_TLS", "value": "rc,sm,self,cuda_copy,cuda_ipc"},
    ]

    # One GPU and one RoCE device per pod; the same resources dict is used for
    # both the decode containers and the prefill template below.
    container_resources = {
        "limits": {
            "cpu": "4",
            "memory": "32Gi",
            "nvidia.com/gpu": "1",
            "rdma/roce_gdr": "1",
        },
        "requests": {
            "cpu": "2",
            "memory": "16Gi",
            "nvidia.com/gpu": "1",
            "rdma/roce_gdr": "1",
        },
    }

    liveness_probe = {
        "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
        "initialDelaySeconds": 120,
        "periodSeconds": 30,
        "timeoutSeconds": 30,
        "failureThreshold": 5,
    }

    # Scheduler config text for prefill-decode separation.
    # Indentation inside this YAML is significant: each `parameters:` block
    # belongs to the plugin entry directly above it, and each profile's
    # `plugins:` list (with per-plugin `weight`) belongs to that profile.
    scheduler_config_text = """apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: prefill-header-handler
- type: prefill-filter
- type: decode-filter
- type: max-score-picker
- type: queue-scorer
  parameters:
    hashBlockSize: 5
    maxPrefixBlocksToMatch: 256
    lruCapacityPerServer: 31250
- type: pd-profile-handler
  parameters:
    threshold: 0
    hashBlockSize: 5
schedulingProfiles:
- name: prefill
  plugins:
  - pluginRef: prefill-filter
  - pluginRef: queue-scorer
    weight: 1.0
  - pluginRef: max-score-picker
- name: decode
  plugins:
  - pluginRef: decode-filter
  - pluginRef: queue-scorer
    weight: 1.0
  - pluginRef: max-score-picker
"""

    # Router config with scheduler configuration for prefill-decode separation
    router_config = {
        "route": {},
        "gateway": {},
        "scheduler": {
            "template": {
                "containers": [
                    {
                        "name": "main",
                        "args": [
                            "--pool-name",
                            "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
                            "--pool-namespace",
                            "{{ .ObjectMeta.Namespace }}",
                            "--zap-encoder",
                            "json",
                            "--grpc-port",
                            "9002",
                            "--grpc-health-port",
                            "9003",
                            "--secure-serving",
                            "--model-server-metrics-scheme",
                            "https",
                            "--model-server-metrics-https-insecure-skip-verify",
                            "--cert-path",
                            "/etc/ssl/certs",
                            "--config-text",
                            scheduler_config_text,
                        ],
                    }
                ]
            }
        },
    }

    # Prefill configuration: same env, resources and probe as the decode side,
    # only the replica count differs.
    prefill_config = {
        "replicas": prefill_replicas,
        "template": {
            "containers": [
                {
                    "name": "main",
                    "env": common_env,
                    "resources": container_resources,
                    "livenessProbe": liveness_probe,
                }
            ]
        },
    }

    # Annotations for RoCE network
    annotations = {
        # RoCE network required for KV cache transfer via RDMA
        "k8s.v1.cni.cncf.io/networks": "roce-p2",
    }

    with create_llmisvc(
        client=admin_client,
        name=service_name,
        namespace=unprivileged_model_namespace.name,
        storage_uri=storage_uri,
        model_name=model_name,
        replicas=decode_replicas,
        router_config=router_config,
        container_env=common_env,
        container_resources=container_resources,
        liveness_probe=liveness_probe,
        prefill_config=prefill_config,
        annotations=annotations,
        wait=True,
        timeout=Timeout.TIMEOUT_30MIN,
    ) as llm_service:
        yield llm_service
0 commit comments