# Source: llm-d-modelservice/examples/values-requester.yaml at main · llm-d-incubation/llm-d-modelservice · GitHub

# This values.yaml file creates the resources for a P/D disaggregation scenario
# Uses a small model: facebook/opt-125m
# See also defaults in chart values.yaml

# Selects the workload kind: when true, LeaderWorkerSet is used instead of
# Deployment. Left false here, so this example uses Deployment.
multinode: false

# Model to serve and where to fetch it from.
modelArtifacts:
  # This is the model name used to start vLLM.
  name: facebook/opt-125m
  labels:
    llm-d.ai/inference-serving: "true"
    llm-d.ai/model: facebook-opt-125m
  # hf:// URI derived from the model name above. The entire value is quoted so
  # that the quotes delimit the YAML scalar; writing hf://"{{ ... }}" instead
  # yields a plain scalar whose value contains literal double quotes around
  # the model name.
  # NOTE(review): presumably rendered by the chart as a template - confirm.
  uri: "hf://{{ .Values.modelArtifacts.name }}"
  size: 20Gi

# Routing requirements: service-level routing (OpenAI model name, service
# port) plus the elements used for Gateway API Inference Extension
# configuration.
routing:
  # Only the overrides listed here are set in this file; other fields are
  # inherited from the chart values.yaml.
  proxy:
    secure: false

  servicePort: 8000

# @schema
# additionalProperties: true
# @schema
# -- Requester configuration part of the dual-pod solution for FMA
requester:
  image: 'ghcr.io/llm-d-incubation/llm-d-fast-model-actuation/requester:latest'
  enable: true
  # Readiness probe timing: first check after 2s, then every 5s.
  readinessProbe:
    periodSeconds: 5
    initialDelaySeconds: 2
  # NOTE(review): "probes" is presumably the port the readiness probe above
  # targets; the meaning of "spi" is not visible in this file - confirm
  # against the chart.
  port:
    spi: 8081
    probes: 8080
  resources:
    limits:
      memory: 250Mi
      cpus: 1
      gpus: 1

# Decode pod configuration
decode:
  replicas: 1
  create: true
  containers:
    - name: vllm
      image: 'ghcr.io/llm-d/llm-d-cuda:latest'
      modelCommand: vllmServe
      mountModelVolume: true
      args:
        - '--enforce-eager'
        - '--kv-transfer-config'
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        - name: CUDA_VISIBLE_DEVICES
          value: '0'
        - name: UCX_TLS
          value: 'cuda_ipc,cuda_copy,tcp'
        # The pod's own IP is injected as the NIXL side channel host.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        # Same port number as the NIXL side channel containerPort below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: '5600'
        - name: VLLM_LOGGING_LEVEL
          value: 'DEBUG'
      ports:
        # from routing.proxy.targetPort
        - protocol: TCP
          containerPort: 8200
        # NIXL side channel
        - protocol: TCP
          containerPort: 5600
      # Requests and limits are set to the same values.
      resources:
        requests:
          cpu: '16'
          memory: 16Gi
          nvidia.com/gpu: '1'
        limits:
          cpu: '16'
          memory: 16Gi
          nvidia.com/gpu: '1'

# Prefill pod configuration
prefill:
  replicas: 1
  create: true
  containers:
    - name: vllm
      image: 'ghcr.io/llm-d/llm-d-cuda:latest'
      modelCommand: vllmServe
      mountModelVolume: true
      args:
        - '--enforce-eager'
        - '--kv-transfer-config'
        - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}'
      env:
        - name: CUDA_VISIBLE_DEVICES
          value: '0'
        - name: UCX_TLS
          value: 'cuda_ipc,cuda_copy,tcp'
        # Same port number as the NIXL side channel containerPort below.
        - name: VLLM_NIXL_SIDE_CHANNEL_PORT
          value: '5600'
        # The pod's own IP is injected as the NIXL side channel host.
        - name: VLLM_NIXL_SIDE_CHANNEL_HOST
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: 'DEBUG'
      ports:
        # from routing.servicePort
        - protocol: TCP
          containerPort: 8000
        # NIXL side channel
        - protocol: TCP
          containerPort: 5600
      # Requests and limits are set to the same values.
      resources:
        requests:
          cpu: '16'
          memory: 16Gi
          nvidia.com/gpu: '1'
        limits:
          cpu: '16'
          memory: 16Gi
          nvidia.com/gpu: '1'