# Source: llm-d-modelservice/examples/values-xpu.yaml (llm-d-incubation/llm-d-modelservice, main branch)
# Simplified Intel XPU configuration using imageDefault mode
# This configuration lets the chart handle most vLLM parameters automatically

modelArtifacts:
  # Model identifier and the hf:// URI it is fetched from.
  name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
  uri: "hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
  # NOTE(review): presumably the storage reserved for the downloaded model -
  # confirm against the chart.
  size: 10Gi
  # Labels associated with this model. "true" is quoted so it stays a string
  # rather than being parsed as a YAML boolean.
  labels:
    llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B
    llm-d.ai/inference-serving: "true"

# Accelerator family targeted by this example.
# NOTE(review): presumably "intel-xe" makes the chart request the matching
# Intel GPU device resource for the pods - confirm against the chart.
accelerator:
  type: "intel-xe"

# Routing configuration: the routing proxy is switched off for this example.
# NOTE(review): presumably this is the chart's routing sidecar that sits in
# front of the vLLM container - confirm against the chart.
routing:
  proxy:
    enabled: false

# Decode pod configuration for Intel XPU - simplified with imageDefault.
# A single replica running one "vllm" container.
decode:
  create: true
  replicas: 1
  containers:
    - name: "vllm"
      # Use custom vLLM image for XPU.
      # NOTE(review): ":latest" is a mutable tag; consider pinning a released
      # tag or digest so deployments are reproducible.
      image: "ghcr.io/llm-d/llm-d-xpu:latest"

      # Use imageDefault mode - chart will generate basic vLLM command
      # automatically; the args below are the XPU-specific additions.
      modelCommand: "imageDefault"

      # Only specify XPU-specific arguments that differ from defaults.
      # Each flag and its value are separate list items. Flags are spelled
      # out in full instead of relying on argparse prefix abbreviation.
      args:
        - "--enforce-eager"
        - "--dtype"
        - "float16"
        - "--disable-sliding-window"
        - "--gpu-memory-utilization"
        - "0.9"
        - "--no-enable-prefix-caching"
        - "--max-num-batched-tokens"
        - "4096"
        - "--disable-log-requests"
        - "--max-model-len"
        - "4096"
        - "--block-size"
        - "64"

      # NOTE(review): routing.proxy.enabled is false in this file; confirm
      # that 8200 is the port vLLM actually listens on in that mode.
      ports:
        - containerPort: 8200
          protocol: TCP

      # CPU and memory only; no accelerator device resource is listed here.
      resources:
        limits:
          memory: 24Gi
          cpu: "8"
        requests:
          cpu: "4"
          memory: 12Gi

      mountModelVolume: true

  # XPU-specific node affinity: nodes are matched on the "accelerator" label
  # carrying one of the values listed below.
  acceleratorTypes:
    labelKey: "accelerator"
    labelValues:
      - "intel-xpu"
      - "intel-gpu-max"

# Disable prefill for simple XPU example: no separate prefill workload is
# created, so only the decode configuration in this file applies.
prefill:
  create: false

# Single-node example. When true, use LeaderWorkerSet for multi-node XPU
# setups.
multinode: false