aibrix/config/gateway/gateway-plugin/gateway-plugin.yaml at 029bac934a873c132febd643ea6f0301e4f23639 · vllm-project/aibrix · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Service in front of the gateway-plugins pods. It exposes three TCP ports
# (gateway, profiling, metrics) and carries Prometheus scrape annotations.
apiVersion: v1
kind: Service
metadata:
  namespace: system
  name: gateway-plugins
  annotations:
    # Scrape hints; port and path match the "metrics" port listed under spec.
    prometheus.io/path: '/metrics'
    prometheus.io/port: '8080'
    prometheus.io/scrape: 'true'
  labels:
    app: gateway-plugins
spec:
  ports:
    # Same port number the reserved-router HTTPRoute and the extProc policy
    # in this file reference.
    - name: gateway
      port: 50052
      targetPort: 50052
      protocol: TCP
    - name: profiling
      port: 6060
      targetPort: 6060
      protocol: TCP
    - name: metrics
      port: 8080
      targetPort: 8080
      protocol: TCP
  # Matches the pod label set by the gateway-plugins Deployment template.
  selector:
    app: gateway-plugins
---
# Deployment that runs the gateway-plugin container. The pod template is
# labelled app=gateway-plugins and exposes the gateway (50052), profiling
# (6060) and metrics (8080) container ports.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway-plugins
  namespace: system
spec:
  # NOTE(review): with replicas: 1, maxUnavailable: 1 allows the only pod to
  # be unavailable during a rollout - confirm this is acceptable.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
      maxSurge: 1
  replicas: 1
  selector:
    matchLabels:
      app: gateway-plugins
  template:
    metadata:
      labels:
        app: gateway-plugins
    spec:
      affinity:
        # Soft rule (preferred, not required): try to spread gateway-plugins
        # pods across different nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - gateway-plugins
                topologyKey: "kubernetes.io/hostname"
        # Soft rule (preferred, not required): try to keep the gateway pod
        # off nodes labelled nvidia.com/gpu.present=true.
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: nvidia.com/gpu.present
                    operator: NotIn
                    values:
                      - "true"
      initContainers:
        # Holds back the main container until aibrix-redis-master:6379
        # answers "ping" with PONG; retries every 2 seconds.
        # NOTE(review): the busybox image tag is not pinned - consider
        # pinning a specific version.
        - name: init-c
          image: busybox
          command:
            - 'sh'
            - '-c'
            - 'until echo "ping" | nc aibrix-redis-master 6379 -w 1  | grep -c PONG; do echo waiting for service aibrix-redis-master; sleep 2; done'
      containers:
        - name: gateway-plugin
          args:
            - --enable-leader-election=false
            # Leader-election flags, currently disabled:
            # - --leader-election-id=gateway-plugin-lock
            # - --leader-election-namespace=aibrix-system
          image: gateway-plugins:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: gateway
              containerPort: 50052
            - name: profiling
              containerPort: 6060
            - name: metrics
              containerPort: 8080
          # Requests are set equal to limits for both CPU and memory.
          resources:
            limits:
              cpu: 1
              memory: 1Gi
            requests:
              cpu: 1
              memory: 1Gi
          env:
            # Same Redis endpoint the init container waits for.
            - name: REDIS_HOST
              value: aibrix-redis-master
            - name: REDIS_PORT
              value: "6379"
            - name: AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS
              value: "50"
            - name: AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE
              value: "character"
            - name: AIBRIX_PREFIX_CACHE_BLOCK_SIZE
              value: "128"
            - name: AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT
              value: "16"
            - name: AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR
              value: "2"
            - name: AIBRIX_PREFILL_REQUEST_TIMEOUT
              value: "60"
            # Uncomment to enable request tracing for GPU optimizer, default "false".
            # - name: AIBRIX_GPU_OPTIMIZER_TRACING_FLAG
            #   value: "true"
            # Downward API: pass the pod's own name and namespace to the
            # container.
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          # Both probes are gRPC checks on the gateway port; they differ only
          # in the service name sent with the health request.
          livenessProbe:
            grpc:
              port: 50052
              service: liveness
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: 50052
              service: readiness
            initialDelaySeconds: 5
            periodSeconds: 10
      serviceAccountName: aibrix-gateway-plugins
---
# Reserved (dummy) route for metadata-style requests, e.g. listing the models
# registered to aibrix-control-plane. Requests whose path starts with
# /v1/models, /v1/files or /v1/batches are sent to aibrix-metadata-service
# on port 8090.
# TODO (varun): check if this dummy route can be removed in future
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  namespace: system
  name: reserved-router-metadata-endpoint
spec:
  parentRefs:
    - name: aibrix-eg
  rules:
    - backendRefs:
        - port: 8090
          name: aibrix-metadata-service
      # A request matches the rule if any one of these prefixes matches.
      matches:
        - path:
            value: /v1/models
            type: PathPrefix
        - path:
            value: /v1/files
            type: PathPrefix
        - path:
            value: /v1/batches
            type: PathPrefix
---
# Extension policy attached to the metadata-endpoint HTTPRoute. It declares
# no extProc section.
# NOTE(review): going by the name, the intent is presumably that this route
# bypasses external processing - confirm.
# NOTE(review): the target name carries an "aibrix-" prefix that the
# HTTPRoute's metadata.name in this file does not; presumably a name prefix
# is added at build time - verify.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: EnvoyExtensionPolicy
metadata:
  namespace: system
  name: skip-ext-proc
spec:
  targetRef:
    name: aibrix-reserved-router-metadata-endpoint
    kind: HTTPRoute
    group: gateway.networking.k8s.io
---
# Reserved (dummy) route for incoming inference requests. After entering
# here, a request is routed either to the httproute selected by model name
# or based on the target for that model service.
# Matched path prefixes are listed under rules; the backend is
# aibrix-gateway-plugins on port 50052.
# TODO (varun): check if this dummy route can be removed in future
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  namespace: system
  name: reserved-router
spec:
  parentRefs:
    - name: aibrix-eg
  rules:
    - backendRefs:
        - port: 50052
          name: aibrix-gateway-plugins
      # A request matches the rule if any one of these prefixes matches.
      matches:
        - path:
            value: /v1/chat/completions
            type: PathPrefix
        - path:
            value: /v1/completions
            type: PathPrefix
        - path:
            value: /v1/embeddings
            type: PathPrefix
        - path:
            value: /v1/rerank
            type: PathPrefix
        - path:
            value: /v1/image/generations
            type: PathPrefix
        - path:
            value: /v1/video/generations
            type: PathPrefix
---
# Extension policy that attaches an external processor (extProc) to the
# reserved-router HTTPRoute. The processor backend is aibrix-gateway-plugins
# on port 50052.
# NOTE(review): the target name carries an "aibrix-" prefix that the
# HTTPRoute's metadata.name in this file does not; presumably a name prefix
# is added at build time - verify.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: EnvoyExtensionPolicy
metadata:
  namespace: system
  name: gateway-plugins-extension-policy
spec:
  targetRef:
    name: aibrix-reserved-router
    kind: HTTPRoute
    group: gateway.networking.k8s.io
  extProc:
    - messageTimeout: 60s
      # Request bodies use Buffered mode, response bodies use Streamed mode.
      processingMode:
        response:
          body: Streamed
        request:
          body: Buffered
      backendRefs:
        - port: 50052
          name: aibrix-gateway-plugins