@@ -257,3 +257,119 @@ def llmd_inference_service_gpu(
257257
258258 with create_llmisvc (** create_kwargs ) as llm_service :
259259 yield llm_service
260+
261+
@pytest.fixture(scope="class")
def deepseek_r1_inference_service(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
) -> Generator[LLMInferenceService, None, None]:
    """Yield an LLMInferenceService running DeepSeek R1 0528 in a multi-node setup.

    Builds the shared container environment, GPU/RDMA resource requests,
    liveness probe, parallelism, router, and worker specs, then creates the
    service via ``create_llmisvc`` and waits up to 30 minutes for readiness.
    """
    service_name = "deepseek-r1-0528"

    # (name, value) pairs applied to both the head container and the workers;
    # expanded below into the k8s env-var dict format.
    env_pairs = [
        ("VLLM_LOGGING_LEVEL", "INFO"),
        ("KSERVE_INFER_ROCE", "true"),
        ("CUDA_DEVICE_ORDER", "PCI_BUS_ID"),
        # Memory optimizations
        ("VLLM_ADDITIONAL_ARGS", "--gpu-memory-utilization 0.95 --max-model-len 8192 --enforce-eager"),
        ("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput"),
        ("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True"),
        # Essential NCCL configuration
        ("NCCL_IB_GID_INDEX", "3"),
        ("NCCL_DEBUG", "WARN"),
        ("NCCL_SOCKET_IFNAME", "net1"),
        ("NCCL_IB_TIMEOUT", "100"),
        # NVSHMEM configuration - optimized for stability
        ("NVSHMEM_REMOTE_TRANSPORT", "ibgda"),
        ("NVSHMEM_BOOTSTRAP_TWO_STAGE", "1"),
        ("NVSHMEM_BOOTSTRAP_TIMEOUT", "300"),
        ("NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME", "net1"),
        ("NVSHMEM_IB_GID_INDEX", "3"),
        ("NVSHMEM_USE_IBGDA", "1"),
        ("NVSHMEM_ENABLE_NIC_PE_MAPPING", "1"),
        ("NVSHMEM_IBGDA_SUPPORT", "1"),
        ("NVSHMEM_IB_ENABLE_IBGDA", "1"),
        ("NVSHMEM_IBGDA_NIC_HANDLER", "gpu"),
        ("NVSHMEM_DEBUG", "WARN"),
        # UCX configuration for NVSHMEM
        ("UCX_TLS", "rc,sm,self,cuda_copy,cuda_ipc"),
        ("UCX_IB_GID_INDEX", "3"),
        ("UCX_RC_MLX5_TM_ENABLE", "n"),
        ("UCX_UD_MLX5_RX_QUEUE_LEN", "1024"),
        ("NVIDIA_GDRCOPY", "enabled"),
    ]
    common_env = [{"name": name, "value": value} for name, value in env_pairs]

    # 8 GPUs + one RoCE GDR device per pod; large ephemeral storage for model weights.
    container_resources = {
        "limits": {
            "cpu": "128",
            "ephemeral-storage": "800Gi",
            "memory": "512Gi",
            "nvidia.com/gpu": "8",
            "rdma/roce_gdr": "1",
        },
        "requests": {
            "cpu": "64",
            "ephemeral-storage": "800Gi",
            "memory": "256Gi",
            "nvidia.com/gpu": "8",
            "rdma/roce_gdr": "1",
        },
    }

    # Long initial delay (80 min) accommodates the model-load time before the
    # HTTPS /health endpoint is expected to answer.
    liveness_probe = {
        "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
        "initialDelaySeconds": 4800,
        "periodSeconds": 10,
        "timeoutSeconds": 10,
        "failureThreshold": 3,
    }

    # Data parallelism of 32 (8 GPUs local per node) with expert parallelism on.
    parallelism_config = {
        "data": 32,
        "dataLocal": 8,
        "expert": True,
        "tensor": 1,
    }

    # Empty sub-dicts request the default scheduler/route/gateway behavior.
    router_config = {"scheduler": {}, "route": {}, "gateway": {}}

    # Worker pods mirror the head container's env and resource shape.
    worker_spec = {
        "serviceAccountName": "hfsa",
        "containers": [
            {
                "name": "main",
                "env": common_env,
                "resources": container_resources,
            }
        ],
    }

    # Attach the secondary RoCE network for RDMA traffic between nodes.
    annotations = {
        "k8s.v1.cni.cncf.io/networks": "roce-p2",
    }

    with create_llmisvc(
        client=admin_client,
        name=service_name,
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.HF_DEEPSEEK_R1_0528,
        model_name="deepseek-ai/DeepSeek-R1-0528",
        replicas=1,
        parallelism=parallelism_config,
        router_config=router_config,
        container_env=common_env,
        container_resources=container_resources,
        liveness_probe=liveness_probe,
        service_account="hfsa",
        worker_config=worker_spec,
        annotations=annotations,
        wait=True,
        timeout=Timeout.TIMEOUT_30MIN,
    ) as llm_service:
        yield llm_service
0 commit comments