# Source: llm-d-modelservice/examples/values-xpu-pd.yaml
# (llm-d-incubation/llm-d-modelservice, branch: main)

# Example values for Intel XPU with Prefill/Decode (P/D) disaggregation.
# WARNING: Intel XPU does not currently support P/D disaggregation.
# This configuration is for reference only. Testing shows:
# - SimpleConnector: not supported
# - NixlConnector: requires the NIXL library, which is not available
# Use values-xpu.yaml for a working Intel XPU deployment instead.

# Model artifacts: the model named here is the same one passed to vLLM via
# "--model" in the decode and prefill container args of this file.
modelArtifacts:
  name: "microsoft/DialoGPT-large"
  # Download location, expressed with the hf:// scheme.
  uri: 'hf://microsoft/DialoGPT-large'
  # NOTE(review): presumably the capacity reserved for the model volume;
  # confirm against the chart's values documentation.
  size: "10Gi"
  labels:
    llm-d.ai/model: "microsoft-dialogpt-large"
    # Quoted so the label value stays the string "true", not a boolean.
    llm-d.ai/inference-serving: 'true'

# Accelerator family targeted by this example (Intel GPU, i915 driver naming).
accelerator:
  type: intel-i915

# Routing settings: the service port plus the routing sidecar ("proxy").
routing:
  proxy:
    # NOTE(review): the header of this file states that NixlConnector needs
    # the NIXL library, which is not available on XPU, so this connector
    # choice is reference-only rather than known-working.
    connector: "nixlv2"
    # Same port the decode vLLM container is started on ("--port 8200").
    targetPort: 8200
    image: "ghcr.io/llm-d/llm-d-routing-sidecar:latest"
  servicePort: 8000

# Decode pod configuration for Intel XPU.
# The vLLM container is started on port 8200 (the value of
# routing.proxy.targetPort in this file) and is configured as the KV consumer
# side of the NixlConnector transfer.
decode:
  create: true
  replicas: 1  # Just 1 decode pod
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d-xpu:latest"
      # IfNotPresent still uses an image that was preloaded on the node, but
      # falls back to pulling it from the registry. The previous "Never" made
      # the pod fail on any node where the image had not been side-loaded,
      # and it differed from the prefill container using the same image.
      imagePullPolicy: IfNotPresent
      modelCommand: custom
      command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
      args:
        - "--model"
        - "microsoft/DialoGPT-large"
        - "--enforce-eager"
        - "--tensor-parallel-size"
        - "1"  # TP=1 for each decode pod (using 1 XPU each)
        - "--port"
        - "8200"
        - "--host"
        - "0.0.0.0"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_consumer"}'
      env:
        - name: ZE_AFFINITY_MASK
          value: "0"  # Decode pod uses XPU 0
        - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER
          value: "1"
        # The NIXL side channel is bound to the pod's own IP (downward API).
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        # Must match the 5600 containerPort exposed below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
        # Intel XPU specific environment variables
        - name: TORCH_LLM_ALLREDUCE
          value: "1"
        - name: VLLM_USE_V1
          value: "1"
        - name: CCL_ZE_IPC_EXCHANGE
          value: "pidfd"
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: "1"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
      ports:
        - containerPort: 8200
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      resources:
        limits:
          memory: 24Gi  # Reduced memory since each pod uses 1 XPU
          cpu: "8"      # Reduced CPU
          gpu.intel.com/i915: "1"  # Each decode pod uses 1 XPU
        requests:
          cpu: "4"
          memory: 12Gi
          gpu.intel.com/i915: "1"
      mountModelVolume: true

  # Node label key/values used to select Intel XPU nodes for this pod.
  acceleratorTypes:
    labelKey: "accelerator"
    labelValues:
      - "intel-xpu"

# Prefill pod configuration for Intel XPU.
# The vLLM container is started on port 8000 and is configured as the KV
# producer side of the NixlConnector transfer.
prefill:
  create: true
  replicas: 1  # Just 1 prefill pod
  containers:
    - name: "vllm"
      image: "ghcr.io/llm-d/llm-d-xpu:latest"
      modelCommand: custom
      command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
      args:
        - "--model"
        - "microsoft/DialoGPT-large"
        - "--enforce-eager"
        - "--tensor-parallel-size"
        - "1"  # TP=1 for prefill
        - "--port"
        - "8000"
        - "--host"
        - "0.0.0.0"
        - "--kv-transfer-config"
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_producer"}'
      env:
        # NOTE(review): the mask presumably indexes the devices visible
        # inside the container; with a single allocated XPU, index 1 may not
        # exist. Confirm on the target node.
        - name: ZE_AFFINITY_MASK
          value: "1"  # Prefill pod uses XPU 1
        - name: ZE_ENABLE_PCI_ID_DEVICE_ORDER
          value: "1"
        # Must match the 5600 containerPort exposed below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: "5600"
        # The NIXL side channel is bound to the pod's own IP (downward API).
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
        # Intel XPU specific environment variables
        - name: TORCH_LLM_ALLREDUCE
          value: "1"
        - name: VLLM_USE_V1
          value: "1"
        - name: CCL_ZE_IPC_EXCHANGE
          value: "pidfd"
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: "1"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
      ports:
        - containerPort: 8000
          protocol: TCP
        - containerPort: 5600  # NIXL side channel
          protocol: TCP
      resources:
        limits:
          memory: 32Gi
          cpu: "16"
          # Request the XPU explicitly, as the decode pod does, so the pod is
          # only scheduled where an Intel GPU can be allocated to it.
          gpu.intel.com/i915: "1"
        requests:
          cpu: "8"
          memory: 16Gi
          gpu.intel.com/i915: "1"
      mountModelVolume: true

  # Node label key/values used to select Intel XPU nodes for this pod.
  acceleratorTypes:
    labelKey: "accelerator"
    labelValues:
      - "intel-xpu"

# NOTE(review): presumably turns off the chart's multi-node deployment mode;
# confirm against the chart's values documentation.
multinode: false