Skip to content

Commit b648be2

Browse files
feat: create dp ep llmisvc
1 parent c5e33e5 commit b648be2

3 files changed

Lines changed: 118 additions & 0 deletions

File tree

tests/model_serving/model_server/llmd/conftest.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,119 @@ def llmd_inference_service_gpu(
257257

258258
with create_llmisvc(**create_kwargs) as llm_service:
259259
yield llm_service
260+
261+
262+
@pytest.fixture(scope="class")
def deepseek_r1_inference_service(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
) -> Generator[LLMInferenceService, None, None]:
    """Yield a DeepSeek R1 0528 LLMInferenceService with a multi-node,
    data/expert-parallel configuration.

    The service is created in the unprivileged test namespace and torn down
    automatically when the consuming test class finishes.
    """
    svc_name = "deepseek-r1-0528"

    # (name, value) pairs applied to both the head pod template and the
    # worker pods; list order is preserved when expanded into K8s env
    # entries below.
    env_pairs = [
        ("VLLM_LOGGING_LEVEL", "INFO"),
        ("KSERVE_INFER_ROCE", "true"),
        ("CUDA_DEVICE_ORDER", "PCI_BUS_ID"),
        # Memory optimizations
        ("VLLM_ADDITIONAL_ARGS", "--gpu-memory-utilization 0.95 --max-model-len 8192 --enforce-eager"),
        ("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput"),
        ("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True"),
        # Essential NCCL configuration
        ("NCCL_IB_GID_INDEX", "3"),
        ("NCCL_DEBUG", "WARN"),
        ("NCCL_SOCKET_IFNAME", "net1"),
        ("NCCL_IB_TIMEOUT", "100"),
        # NVSHMEM configuration - optimized for stability
        ("NVSHMEM_REMOTE_TRANSPORT", "ibgda"),
        ("NVSHMEM_BOOTSTRAP_TWO_STAGE", "1"),
        ("NVSHMEM_BOOTSTRAP_TIMEOUT", "300"),
        ("NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME", "net1"),
        ("NVSHMEM_IB_GID_INDEX", "3"),
        ("NVSHMEM_USE_IBGDA", "1"),
        ("NVSHMEM_ENABLE_NIC_PE_MAPPING", "1"),
        ("NVSHMEM_IBGDA_SUPPORT", "1"),
        ("NVSHMEM_IB_ENABLE_IBGDA", "1"),
        ("NVSHMEM_IBGDA_NIC_HANDLER", "gpu"),
        ("NVSHMEM_DEBUG", "WARN"),
        # UCX configuration for NVSHMEM
        ("UCX_TLS", "rc,sm,self,cuda_copy,cuda_ipc"),
        ("UCX_IB_GID_INDEX", "3"),
        ("UCX_RC_MLX5_TM_ENABLE", "n"),
        ("UCX_UD_MLX5_RX_QUEUE_LEN", "1024"),
        ("NVIDIA_GDRCOPY", "enabled"),
    ]
    shared_env = [{"name": key, "value": value} for key, value in env_pairs]

    # 8 GPUs plus one RoCE GDR device per pod; CPU/memory requests are half
    # their limits, ephemeral storage is requested at the limit.
    gpu_resources = {
        "limits": {
            "cpu": "128",
            "ephemeral-storage": "800Gi",
            "memory": "512Gi",
            "nvidia.com/gpu": "8",
            "rdma/roce_gdr": "1",
        },
        "requests": {
            "cpu": "64",
            "ephemeral-storage": "800Gi",
            "memory": "256Gi",
            "nvidia.com/gpu": "8",
            "rdma/roce_gdr": "1",
        },
    }

    # NOTE(review): initialDelaySeconds of 4800s (~80 min) before the first
    # health check — presumably covering model download and weight load for a
    # very large model; confirm against observed startup times.
    health_probe = {
        "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
        "initialDelaySeconds": 4800,
        "periodSeconds": 10,
        "timeoutSeconds": 10,
        "failureThreshold": 3,
    }

    with create_llmisvc(
        client=admin_client,
        name=svc_name,
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.HF_DEEPSEEK_R1_0528,
        model_name="deepseek-ai/DeepSeek-R1-0528",
        replicas=1,
        # DP=32 total with 8 DP ranks local to each node, expert parallelism
        # enabled, TP=1 — i.e. a 4-node deployment at 8 GPUs per node.
        parallelism={"data": 32, "dataLocal": 8, "expert": True, "tensor": 1},
        # Empty sub-configs request default scheduler/route/gateway behavior.
        router_config={"scheduler": {}, "route": {}, "gateway": {}},
        container_env=shared_env,
        container_resources=gpu_resources,
        liveness_probe=health_probe,
        service_account="hfsa",
        # Worker pods deliberately mirror the head container's env and
        # resource shape.
        worker_config={
            "serviceAccountName": "hfsa",
            "containers": [
                {
                    "name": "main",
                    "env": shared_env,
                    "resources": gpu_resources,
                }
            ],
        },
        # Attach the pods to the RoCE secondary network for GPU-direct RDMA.
        annotations={"k8s.v1.cni.cncf.io/networks": "roce-p2"},
        wait=True,
        timeout=Timeout.TIMEOUT_30MIN,
    ) as llm_service:
        yield llm_service

utilities/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,7 @@ class S3:
295295

296296
class HuggingFace:
297297
TINYLLAMA: str = "hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0"
298+
DEEPSEEK_R1_0528: str = "hf://deepseek-ai/DeepSeek-R1-0528"
298299

299300

300301
class OCIRegistry:

utilities/llmd_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class ModelStorage:
3535
TINYLLAMA_S3: str = SharedModelStorage.S3.TINYLLAMA
3636
S3_QWEN: str = SharedModelStorage.S3.QWEN_7B_INSTRUCT
3737
HF_TINYLLAMA: str = SharedModelStorage.HuggingFace.TINYLLAMA
38+
HF_DEEPSEEK_R1_0528: str = SharedModelStorage.HuggingFace.DEEPSEEK_R1_0528
3839

3940

4041
class ContainerImages:

0 commit comments

Comments
 (0)