# Source: llm-d-modelservice/examples/values-heterogeneous-pd.yaml
# (llm-d-incubation/llm-d-modelservice on GitHub, main branch)
#
# This values.yaml file demonstrates heterogeneous accelerator support for P/D disaggregation
# Uses NVIDIA GPUs for decode (high performance for token generation)
# Uses Intel Gaudi for prefill (cost-effective for prompt processing)
# Based on values-pd.yaml with added accelerator type overrides

# When true, LeaderWorkerSet is used instead of Deployment.
# Disabled in this example.
multinode: false

# Global accelerator configuration (fallback for components without a specific type).
# The decode section below sets no accelerator.type of its own, so it uses this value.
accelerator:
  type: nvidia

modelArtifacts:
  # This is the model name used to start vLLM.
  name: facebook/opt-125m
  labels:
    llm-d.ai/inference-serving: "true"
    llm-d.ai/model: facebook-opt-125m
  # The URI is quoted as a single scalar. Quotes placed after the scheme
  # (hf://"...") are not YAML quoting; they become literal characters in the
  # value and end up inside the model path.
  # The template expression refers to modelArtifacts.name above.
  uri: "hf://{{ .Values.modelArtifacts.name }}"
  size: 20Gi

# Describes routing requirements. In addition to service-level routing (OpenAI model name, service port),
# this section also describes elements of the Gateway API Inference Extension configuration.
routing:
  servicePort: 8000

  # Other fields are inherited from the chart's values.yaml.
  proxy:
    secure: false

# Decode pod configuration - Uses NVIDIA GPUs with DRA for high-performance token generation
decode:
  create: true
  replicas: 1

  # Override accelerator configuration for decode pods.
  # Using DRA (Dynamic Resource Allocation) for decode.
  accelerator:
    # type: nvidia  # Not specified - uses global accelerator.type
    dra: true  # Enable DRA for decode pods

  parallelism:
    tensor: 2
    data: 1
    dataLocal: 1
    workers: 1

  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d:v0.2.0"
      modelCommand: vllmServe
      args:
        - "--enforce-eager"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        # Two devices are made visible to match parallelism.tensor: 2 above;
        # exposing only device 0 would leave the second tensor-parallel rank
        # without a GPU.
        # NOTE(review): if a single-GPU decode pod was intended instead, set
        # parallelism.tensor to 1 and restore "0" here.
        - name: CUDA_VISIBLE_DEVICES
          value: "0,1"
        - name: UCX_TLS
          value: "cuda_ipc,cuda_copy,tcp"
        # Side channel host is the pod's own IP, read from the pod status.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        # Must agree with the NIXL side channel containerPort listed below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
      ports:
        - containerPort: 8200  # from routing.proxy.targetPort
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      # Requests and limits are set to the same values.
      resources:
        limits:
          memory: 16Gi
          cpu: "16"
        requests:
          cpu: "16"
          memory: 16Gi
      mountModelVolume: true

# Prefill pod configuration - Uses Intel Gaudi for cost-effective prompt processing
prefill:
  create: true
  replicas: 1

  # Override accelerator configuration for prefill pods.
  # Using device plugin mode (not DRA) for prefill.
  # Without this override prefill would fall back to the global
  # accelerator.type (nvidia), which contradicts the Gaudi image and the
  # HABANA_VISIBLE_DEVICES setting below.
  accelerator:
    type: intel-gaudi
    # dra: false  # Not specified - uses device plugin mode (global default)

  parallelism:
    tensor: 1
    data: 1
    dataLocal: 1
    workers: 1

  containers:
    - name: "vllm"
      # Using Intel-optimized vLLM image for Gaudi
      # NOTE(review): the image name is a PyTorch installer image - confirm it
      # actually ships vLLM. The "latest" tag is mutable; consider pinning it.
      image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
      modelCommand: vllmServe
      args:
        - "--enforce-eager"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        # A single device is exposed, matching parallelism.tensor: 1 above.
        - name: HABANA_VISIBLE_DEVICES
          value: "0"
        - name: UCX_TLS
          value: "tcp"
        # Must agree with the NIXL side channel containerPort listed below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        # Side channel host is the pod's own IP, read from the pod status.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
      ports:
        - containerPort: 8000  # from routing.servicePort
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      # Requests and limits are set to the same values.
      resources:
        limits:
          memory: 16Gi
          cpu: "16"
        requests:
          cpu: "16"
          memory: 16Gi
      mountModelVolume: true