Skip to content

Commit af973b9

Browse files
sozercan and Copilot authored
feat: integrate Gateway API Inference Extension for unified inference routing (#73)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 8be7150 commit af973b9

42 files changed

Lines changed: 3492 additions & 445 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/e2e-gateway.yml

Lines changed: 373 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,373 @@
1+
name: E2E Gateway Tests
2+
3+
on:
4+
push:
5+
branches: [main]
6+
pull_request:
7+
branches: [main]
8+
workflow_dispatch:
9+
10+
jobs:
11+
e2e-gateway:
12+
runs-on: ubuntu-latest-16-cores
13+
timeout-minutes: 45
14+
15+
steps:
16+
- name: Checkout repository
17+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v4
18+
19+
- name: Setup Go
20+
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5
21+
with:
22+
go-version: "1.25"
23+
cache-dependency-path: controller/go.sum
24+
25+
- name: Setup Kind
26+
run: |
27+
go install sigs.k8s.io/kind@latest
28+
kind create cluster --name kubeairunway-gw-e2e --wait 120s
29+
# Remove the exclude-from-external-load-balancers label from the control-plane node so it can back LoadBalancer services (best-effort; errors are ignored)
30+
kubectl label node kubeairunway-gw-e2e-control-plane node.kubernetes.io/exclude-from-external-load-balancers- 2>/dev/null || true
31+
32+
- name: Install cloud-provider-kind
33+
run: |
34+
go install sigs.k8s.io/cloud-provider-kind@latest
35+
cloud-provider-kind &
36+
sleep 5
37+
echo "✅ cloud-provider-kind running"
38+
39+
- name: Install Gateway API CRDs
40+
run: |
41+
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/latest/download/standard-install.yaml
42+
43+
- name: Install Gateway API Inference Extension CRDs
44+
run: |
45+
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.3.1/manifests.yaml
46+
47+
- name: Install Istio with Inference Extension support
48+
run: |
49+
curl -L https://istio.io/downloadIstio | sh -
50+
cd istio-*/bin
51+
./istioctl install --set profile=minimal \
52+
--set values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true -y
53+
kubectl wait --for=condition=Available deployment/istiod -n istio-system --timeout=120s
54+
echo "✅ Istio installed"
55+
56+
- name: Install KAITO operator
57+
run: |
58+
helm repo add kaito https://kaito-project.github.io/kaito/charts/kaito
59+
helm install kaito-workspace kaito/workspace \
60+
--namespace kaito-workspace \
61+
--create-namespace \
62+
--set featureGates.disableNodeAutoProvisioning=true
63+
kubectl wait --for=condition=Available deployment -n kaito-workspace -l app.kubernetes.io/name=workspace --timeout=120s
64+
65+
- name: Build and deploy controller
66+
run: |
67+
make controller-docker-build CONTROLLER_IMG=kubeairunway-controller:e2e
68+
kind load docker-image kubeairunway-controller:e2e --name kubeairunway-gw-e2e
69+
make controller-deploy CONTROLLER_IMG=kubeairunway-controller:e2e
70+
kubectl wait --for=condition=Available deployment -n kubeairunway-system -l control-plane=controller-manager --timeout=120s
71+
72+
- name: Build and deploy KAITO provider
73+
run: |
74+
make kaito-provider-docker-build KAITO_PROVIDER_IMG=kaito-provider:e2e
75+
kind load docker-image kaito-provider:e2e --name kubeairunway-gw-e2e
76+
make kaito-provider-deploy KAITO_PROVIDER_IMG=kaito-provider:e2e
77+
kubectl wait --for=condition=Available deployment -n kubeairunway-system -l control-plane=kaito-provider --timeout=120s
78+
79+
- name: Wait for provider registration
80+
run: |
81+
kubectl wait --for=jsonpath='{.status.ready}'=true inferenceproviderconfig/kaito --timeout=120s
82+
83+
- name: Create Gateway resource
84+
run: |
85+
kubectl apply -f controller/test/e2e/testdata/gateway.yaml
86+
echo "Waiting for Gateway to be programmed..."
87+
for i in $(seq 1 30); do
88+
PROGRAMMED=$(kubectl get gateway inference-gateway -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null || echo "")
89+
if [ "$PROGRAMMED" = "True" ]; then
90+
echo "✅ Gateway is programmed"
91+
break
92+
fi
93+
echo "Attempt $i/30: programmed=$PROGRAMMED"
94+
if [ "$i" = "30" ]; then
95+
echo "⚠️ Gateway not programmed after 30 attempts, continuing anyway (Kind may not support LoadBalancer)"
96+
fi
97+
sleep 5
98+
done
99+
100+
- name: Create ModelDeployment with gateway enabled
101+
run: |
102+
kubectl apply -f controller/test/e2e/testdata/gateway-modeldeployment.yaml
103+
104+
- name: Wait for ModelDeployment to reach Running phase
105+
run: |
106+
kubectl wait --for=condition=WorkspaceSucceeded workspace/llama-gw-e2e -n default --timeout=600s 2>/dev/null || true
107+
108+
echo "Waiting for ModelDeployment to reach Running phase..."
109+
for i in $(seq 1 60); do
110+
PHASE=$(kubectl get modeldeployment llama-gw-e2e -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
111+
echo "Attempt $i/60: phase=$PHASE"
112+
if [ "$PHASE" = "Running" ]; then
113+
echo "✅ ModelDeployment is Running"
114+
exit 0
115+
fi
116+
sleep 10
117+
done
118+
echo "❌ Timed out waiting for ModelDeployment to reach Running phase"
119+
exit 1
120+
121+
- name: Verify InferencePool created
122+
run: |
123+
echo "Waiting for InferencePool..."
124+
for i in $(seq 1 30); do
125+
if kubectl get inferencepool llama-gw-e2e -n default > /dev/null 2>&1; then
126+
echo "✅ InferencePool found"
127+
break
128+
fi
129+
echo "Attempt $i/30: InferencePool not found yet"
130+
if [ "$i" = "30" ]; then
131+
echo "❌ Timed out waiting for InferencePool"
132+
exit 1
133+
fi
134+
sleep 5
135+
done
136+
137+
# Verify selector label
138+
SELECTOR=$(kubectl get inferencepool llama-gw-e2e -n default \
139+
-o jsonpath='{.spec.selector.matchLabels.kubeairunway\.ai/model-deployment}')
140+
if [ "$SELECTOR" != "llama-gw-e2e" ]; then
141+
echo "❌ InferencePool selector mismatch: expected 'llama-gw-e2e', got '$SELECTOR'"
142+
exit 1
143+
fi
144+
echo "✅ InferencePool selector correct"
145+
146+
# Verify endpointPickerRef
147+
EPP_NAME=$(kubectl get inferencepool llama-gw-e2e -n default \
148+
-o jsonpath='{.spec.endpointPickerRef.name}')
149+
if [ -z "$EPP_NAME" ]; then
150+
echo "❌ InferencePool missing endpointPickerRef"
151+
exit 1
152+
fi
153+
echo "✅ InferencePool endpointPickerRef set: $EPP_NAME"
154+
155+
- name: Verify HTTPRoute created
156+
run: |
157+
echo "Waiting for HTTPRoute..."
158+
for i in $(seq 1 30); do
159+
if kubectl get httproute llama-gw-e2e -n default > /dev/null 2>&1; then
160+
echo "✅ HTTPRoute found"
161+
break
162+
fi
163+
echo "Attempt $i/30: HTTPRoute not found yet"
164+
if [ "$i" = "30" ]; then
165+
echo "❌ Timed out waiting for HTTPRoute"
166+
exit 1
167+
fi
168+
sleep 5
169+
done
170+
171+
# Verify parent ref points to gateway
172+
PARENT=$(kubectl get httproute llama-gw-e2e -n default \
173+
-o jsonpath='{.spec.parentRefs[0].name}')
174+
if [ "$PARENT" != "inference-gateway" ]; then
175+
echo "❌ HTTPRoute parent mismatch: expected 'inference-gateway', got '$PARENT'"
176+
exit 1
177+
fi
178+
echo "✅ HTTPRoute parent ref correct"
179+
180+
# Verify backend ref points to InferencePool
181+
BACKEND_GROUP=$(kubectl get httproute llama-gw-e2e -n default \
182+
-o jsonpath='{.spec.rules[0].backendRefs[0].group}')
183+
BACKEND_KIND=$(kubectl get httproute llama-gw-e2e -n default \
184+
-o jsonpath='{.spec.rules[0].backendRefs[0].kind}')
185+
if [ "$BACKEND_GROUP" != "inference.networking.k8s.io" ] || [ "$BACKEND_KIND" != "InferencePool" ]; then
186+
echo "❌ HTTPRoute backend ref mismatch: group=$BACKEND_GROUP kind=$BACKEND_KIND"
187+
exit 1
188+
fi
189+
echo "✅ HTTPRoute backend ref correct"
190+
191+
- name: Verify gateway status and model name auto-discovery
192+
run: |
193+
echo "Waiting for GatewayReady condition..."
194+
for i in $(seq 1 30); do
195+
GW_READY=$(kubectl get modeldeployment llama-gw-e2e -n default \
196+
-o jsonpath='{.status.conditions[?(@.type=="GatewayReady")].status}' 2>/dev/null || echo "")
197+
if [ "$GW_READY" = "True" ]; then
198+
echo "✅ GatewayReady condition is True"
199+
break
200+
fi
201+
echo "Attempt $i/30: GatewayReady=$GW_READY"
202+
if [ "$i" = "30" ]; then
203+
echo "❌ Timed out waiting for GatewayReady condition"
204+
exit 1
205+
fi
206+
sleep 5
207+
done
208+
209+
# Check auto-discovered model name
210+
MODEL_NAME=$(kubectl get modeldeployment llama-gw-e2e -n default \
211+
-o jsonpath='{.status.gateway.modelName}')
212+
if [ -z "$MODEL_NAME" ]; then
213+
echo "❌ Gateway model name is empty"
214+
exit 1
215+
fi
216+
echo "✅ Gateway model name auto-discovered: $MODEL_NAME"
217+
218+
- name: Wait for EPP to be ready
219+
run: |
220+
echo "Waiting for EPP deployment..."
221+
for i in $(seq 1 30); do
222+
READY=$(kubectl get deployment llama-gw-e2e-epp -n default -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
223+
if [ "$READY" = "1" ]; then
224+
echo "✅ EPP is ready"
225+
break
226+
fi
227+
echo "Attempt $i/30: EPP readyReplicas=$READY"
228+
if [ "$i" = "30" ]; then
229+
echo "❌ EPP not ready"
230+
exit 1
231+
fi
232+
sleep 10
233+
done
234+
235+
- name: Configure Istio DestinationRule for EPP
236+
run: |
237+
kubectl apply -f - <<'DREOF'
238+
apiVersion: networking.istio.io/v1beta1
239+
kind: DestinationRule
240+
metadata:
241+
name: llama-gw-e2e-epp
242+
namespace: default
243+
spec:
244+
host: llama-gw-e2e-epp.default.svc.cluster.local
245+
trafficPolicy:
246+
tls:
247+
mode: SIMPLE
248+
insecureSkipVerify: true
249+
DREOF
250+
echo "✅ Istio DestinationRule created for EPP"
251+
252+
- name: Install Body-Based Router (BBR)
253+
run: |
254+
helm install body-based-router \
255+
--set provider.name=istio \
256+
--version v1.3.1 \
257+
oci://registry.k8s.io/gateway-api-inference-extension/charts/body-based-routing \
258+
--wait --timeout 120s
259+
echo "✅ BBR installed"
260+
261+
- name: Test inference through gateway
262+
run: |
263+
MODEL_NAME=$(kubectl get modeldeployment llama-gw-e2e -n default \
264+
-o jsonpath='{.status.gateway.modelName}')
265+
echo "Model name: $MODEL_NAME"
266+
267+
# Get the Gateway LoadBalancer IP (provided by cloud-provider-kind)
268+
GW_IP=""
269+
for i in $(seq 1 30); do
270+
GW_IP=$(kubectl get gateway inference-gateway -o jsonpath='{.status.addresses[0].value}' 2>/dev/null || echo "")
271+
if [ -n "$GW_IP" ]; then
272+
echo "Gateway IP: $GW_IP"
273+
break
274+
fi
275+
echo "Waiting for Gateway IP... attempt $i/30"
276+
sleep 5
277+
done
278+
279+
if [ -z "$GW_IP" ]; then
280+
echo "❌ Gateway IP not assigned"
281+
exit 1
282+
fi
283+
284+
echo "Sending inference request through gateway at http://${GW_IP}..."
285+
for i in $(seq 1 18); do
286+
HTTP_CODE=$(curl -s -o /tmp/response.json -w '%{http_code}' --max-time 30 \
287+
http://${GW_IP}/v1/chat/completions \
288+
-H "Content-Type: application/json" \
289+
-d "{
290+
\"model\": \"$MODEL_NAME\",
291+
\"messages\": [{\"role\": \"user\", \"content\": \"Say hello in one word.\"}],
292+
\"max_tokens\": 10
293+
}" 2>&1 || true)
294+
RESPONSE=$(cat /tmp/response.json 2>/dev/null || echo "")
295+
296+
if [ "$HTTP_CODE" = "200" ] && echo "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
297+
echo "Response: $RESPONSE"
298+
echo "✅ Inference through gateway succeeded"
299+
exit 0
300+
fi
301+
echo "Attempt $i/18: HTTP=$HTTP_CODE body=$(echo $RESPONSE | head -c 200)"
302+
sleep 10
303+
done
304+
echo "❌ Inference through gateway failed"
305+
exit 1
306+
307+
- name: Test gateway disable and cleanup
308+
run: |
309+
# Disable gateway
310+
kubectl patch modeldeployment llama-gw-e2e -n default \
311+
--type=merge -p '{"spec":{"gateway":{"enabled":false}}}'
312+
313+
echo "Waiting for gateway resources to be cleaned up..."
314+
sleep 15
315+
316+
# Verify InferencePool deleted
317+
if kubectl get inferencepool llama-gw-e2e -n default 2>/dev/null; then
318+
echo "❌ InferencePool should have been deleted"
319+
exit 1
320+
fi
321+
echo "✅ InferencePool cleaned up"
322+
323+
# Verify HTTPRoute deleted
324+
if kubectl get httproute llama-gw-e2e -n default 2>/dev/null; then
325+
echo "❌ HTTPRoute should have been deleted"
326+
exit 1
327+
fi
328+
echo "✅ HTTPRoute cleaned up"
329+
330+
# Verify GatewayReady condition is False
331+
GW_READY=$(kubectl get modeldeployment llama-gw-e2e -n default \
332+
-o jsonpath='{.status.conditions[?(@.type=="GatewayReady")].status}')
333+
if [ "$GW_READY" != "False" ]; then
334+
echo "❌ GatewayReady condition should be False after disable: $GW_READY"
335+
exit 1
336+
fi
337+
echo "✅ GatewayReady condition is False after disable"
338+
339+
- name: Collect debug info
340+
if: failure()
341+
run: |
342+
echo "=== ModelDeployments ==="
343+
kubectl get modeldeployments -A -o yaml
344+
echo "=== InferencePools ==="
345+
kubectl get inferencepools -A -o yaml 2>/dev/null || echo "No InferencePools"
346+
echo "=== HTTPRoutes ==="
347+
kubectl get httproutes -A -o yaml 2>/dev/null || echo "No HTTPRoutes"
348+
echo "=== Gateways ==="
349+
kubectl get gateways -A -o yaml 2>/dev/null || echo "No Gateways"
350+
echo "=== Workspaces ==="
351+
kubectl get workspaces -A -o yaml
352+
echo "=== Controller Logs ==="
353+
kubectl logs -n kubeairunway-system -l control-plane=controller-manager --tail=200
354+
echo "=== KAITO Provider Logs ==="
355+
kubectl logs -n kubeairunway-system -l control-plane=kaito-provider --tail=100
356+
echo "=== EPP Logs ==="
357+
kubectl logs -n default -l app.kubernetes.io/name=llama-gw-e2e-epp --tail=100 2>/dev/null || echo "No EPP logs"
358+
echo "=== Istio Logs ==="
359+
kubectl logs -n istio-system -l app=istiod --tail=100 2>/dev/null || echo "No Istio logs"
360+
echo "=== Gateway Proxy Logs ==="
361+
GW_POD=$(kubectl get pods -n default -l gateway.networking.k8s.io/gateway-name=inference-gateway -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
362+
[ -n "$GW_POD" ] && kubectl logs "$GW_POD" -n default --tail=50 2>/dev/null || echo "No gateway proxy logs"
363+
echo "=== Gateway Pods ==="
364+
kubectl get pods -n default -l gateway.networking.k8s.io/gateway-name=inference-gateway -o yaml
365+
echo "=== Events ==="
366+
kubectl get events -A --sort-by=.lastTimestamp
367+
echo "=== Pods ==="
368+
kubectl get pods -A
369+
370+
- name: Cleanup
371+
if: always()
372+
run: |
373+
kind delete cluster --name kubeairunway-gw-e2e

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
# Controller image
88
CONTROLLER_IMG ?= ghcr.io/kaito-project/kubeairunway-controller:latest
99

10+
# Gateway API Inference Extension version
11+
GAIE_VERSION ?= v1.3.1
12+
1013
# Provider images
1114
KAITO_PROVIDER_IMG ?= ghcr.io/kaito-project/kaito-provider:latest
1215
DYNAMO_PROVIDER_IMG ?= ghcr.io/kaito-project/dynamo-provider:latest

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ KubeAIRunway gives you a web UI and a unified Kubernetes CRD (`ModelDeployment`)
1616
- 🔧 **Multiple Engines** — [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [llama.cpp](https://github.com/ggml-org/llama.cpp)
1717
- 📈 **Live Monitoring** — Real-time status, logs, and Prometheus metrics
1818
- 💰 **Cost Estimation** — GPU pricing and capacity guidance
19+
- 🌐 **Gateway API Integration** — Unified inference endpoint via [Gateway API Inference Extension](https://gateway-api.sigs.k8s.io/geps/gep-3567/) with auto-detected setup
1920
- 🔌 **Headlamp Plugin** — Full-featured [Headlamp](https://headlamp.dev/) dashboard plugin
2021

2122
## Supported Providers
@@ -97,6 +98,7 @@ The controller automatically selects the best engine and provider, creates provi
9798
| Observability | [docs/observability.md](docs/observability.md) |
9899
| Development | [docs/development.md](docs/development.md) |
99100
| Kubernetes Deployment | [deploy/kubernetes/README.md](deploy/kubernetes/README.md) |
101+
| Gateway Integration | [docs/gateway.md](docs/gateway.md) |
100102
| Headlamp Plugin | [docs/headlamp-plugin.md](docs/headlamp-plugin.md) |
101103
102104
## Contributing

0 commit comments

Comments
 (0)