Skip to content

Commit 20fc3bf

Browse files
authored
e2e: add provider switch (upstream/azure), bump keda-kaito-scaler to v0.5.1, instrument timings (#50)
1 parent cf22930 commit 20fc3bf

12 files changed

Lines changed: 502 additions & 49 deletions

File tree

.github/actions/e2e-base-setup/action.yaml

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ inputs:
2121
location:
2222
description: 'Azure region for the cluster.'
2323
required: false
24-
default: 'swedencentral'
24+
default: 'australiaeast'
2525
node-count:
2626
description: 'Number of AKS nodes.'
2727
required: false
@@ -30,6 +30,10 @@ inputs:
3030
description: 'AKS node VM size.'
3131
required: false
3232
default: 'Standard_D8s_v5'
33+
provider:
34+
description: 'E2E provider (upstream|azure) — master switch for component install mix. Empty defaults to upstream.'
35+
required: false
36+
default: ''
3337
# Component version overrides — empty string means "use versions.env default".
3438
istio-version:
3539
description: 'Istio version override.'
@@ -70,21 +74,41 @@ runs:
7074
# Source defaults from the centralized version file (loaded exactly once).
7175
source versions.env
7276
# Inputs override defaults when non-empty.
77+
INPUT_PROVIDER="${{ inputs.provider }}"
7378
INPUT_ISTIO="${{ inputs.istio-version }}"
7479
INPUT_GWAPI="${{ inputs.gateway-api-version }}"
7580
INPUT_BBR="${{ inputs.bbr-version }}"
7681
INPUT_KEDA="${{ inputs.keda-version }}"
7782
INPUT_KKS="${{ inputs.keda-kaito-scaler-version }}"
7883
INPUT_LGA="${{ inputs.llm-gateway-auth-version }}"
84+
[ -n "${INPUT_PROVIDER}" ] && E2E_PROVIDER="${INPUT_PROVIDER}"
7985
[ -n "${INPUT_ISTIO}" ] && ISTIO_VERSION="${INPUT_ISTIO}"
8086
[ -n "${INPUT_GWAPI}" ] && GATEWAY_API_VERSION="${INPUT_GWAPI}"
8187
[ -n "${INPUT_BBR}" ] && BBR_VERSION="${INPUT_BBR}"
8288
[ -n "${INPUT_KEDA}" ] && KEDA_VERSION="${INPUT_KEDA}"
8389
[ -n "${INPUT_KKS}" ] && KEDA_KAITO_SCALER_VERSION="${INPUT_KKS}"
8490
[ -n "${INPUT_LGA}" ] && LLM_GATEWAY_AUTH_VERSION="${INPUT_LGA}"
91+
92+
# Validate provider value.
93+
case "${E2E_PROVIDER}" in
94+
upstream|azure) ;;
95+
*)
96+
echo "Invalid E2E_PROVIDER='${E2E_PROVIDER}'. Must be 'upstream' or 'azure'." >&2
97+
exit 1
98+
;;
99+
esac
100+
101+
# Derive KEDA install namespace from provider.
102+
case "${E2E_PROVIDER}" in
103+
upstream) KEDA_NAMESPACE="keda" ;;
104+
azure) KEDA_NAMESPACE="kube-system" ;;
105+
esac
106+
85107
# Export to GITHUB_ENV so all subsequent steps inherit these.
86108
{
87109
echo "GO_VERSION=${GO_VERSION}"
110+
echo "E2E_PROVIDER=${E2E_PROVIDER}"
111+
echo "KEDA_NAMESPACE=${KEDA_NAMESPACE}"
88112
echo "ISTIO_VERSION=${ISTIO_VERSION}"
89113
echo "GATEWAY_API_VERSION=${GATEWAY_API_VERSION}"
90114
echo "BBR_VERSION=${BBR_VERSION}"
@@ -98,6 +122,8 @@ runs:
98122
- name: Print versions
99123
shell: bash
100124
run: |
125+
echo "E2E_PROVIDER=${E2E_PROVIDER}"
126+
echo "KEDA_NAMESPACE=${KEDA_NAMESPACE}"
101127
echo "GO_VERSION=${GO_VERSION}"
102128
echo "ISTIO_VERSION=${ISTIO_VERSION}"
103129
echo "GATEWAY_API_VERSION=${GATEWAY_API_VERSION}"

.github/workflows/benchmark.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ env:
3737
CLUSTER_NAME: "kaito-gw-bench-aks-${{ github.run_id }}"
3838
ACR_NAME: "kaitogwbenchaks${{ github.run_id }}acr"
3939
GPU_MOCKER_IMAGE: "gpu-node-mocker:latest-${{ github.run_id }}"
40-
LOCATION: swedencentral
40+
LOCATION: australiaeast
4141
NODE_COUNT: '3'
4242
NODE_VM_SIZE: Standard_D8s_v5
4343
# Dedicated workload namespace provisioned by charts/modelharness.

.github/workflows/e2e-nightly.yaml

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ on:
1919
description: 'Branch name to run E2E tests against'
2020
required: true
2121
default: 'main'
22+
provider:
23+
description: 'E2E provider — "upstream" or "azure" (Azure-managed KEDA add-on).'
24+
required: false
25+
default: 'upstream'
26+
type: choice
27+
options:
28+
- upstream
29+
- azure
2230
istio_version:
2331
description: 'Istio version (e.g. 1.29.0). Leave empty to use default from versions.env'
2432
required: false
@@ -45,7 +53,7 @@ env:
4553
CLUSTER_NAME: "kaito-gw-e2e-nightly-aks-${{ github.run_id }}"
4654
ACR_NAME: "kaitogwe2enightly${{ github.run_id }}acr"
4755
GPU_MOCKER_IMAGE: "gpu-node-mocker:latest-${{ github.run_id }}"
48-
LOCATION: swedencentral
56+
LOCATION: australiaeast
4957
NODE_COUNT: '3'
5058
NODE_VM_SIZE: Standard_D8s_v5
5159

@@ -70,19 +78,37 @@ jobs:
7078
# Source defaults from the centralized version file (loaded exactly once).
7179
source versions.env
7280
# workflow_dispatch inputs override defaults when non-empty.
81+
INPUT_PROVIDER="${{ github.event.inputs.provider }}"
7382
INPUT_ISTIO="${{ github.event.inputs.istio_version }}"
7483
INPUT_GWAPI="${{ github.event.inputs.gateway_api_version }}"
7584
INPUT_BBR="${{ github.event.inputs.bbr_version }}"
7685
INPUT_KEDA="${{ github.event.inputs.keda_version }}"
7786
INPUT_KKS="${{ github.event.inputs.keda_kaito_scaler_version }}"
87+
[ -n "${INPUT_PROVIDER}" ] && E2E_PROVIDER="${INPUT_PROVIDER}"
7888
[ -n "${INPUT_ISTIO}" ] && ISTIO_VERSION="${INPUT_ISTIO}"
7989
[ -n "${INPUT_GWAPI}" ] && GATEWAY_API_VERSION="${INPUT_GWAPI}"
8090
[ -n "${INPUT_BBR}" ] && BBR_VERSION="${INPUT_BBR}"
8191
[ -n "${INPUT_KEDA}" ] && KEDA_VERSION="${INPUT_KEDA}"
8292
[ -n "${INPUT_KKS}" ] && KEDA_KAITO_SCALER_VERSION="${INPUT_KKS}"
93+
94+
# Validate provider value.
95+
case "${E2E_PROVIDER}" in
96+
upstream|azure) ;;
97+
*)
98+
echo "Invalid E2E_PROVIDER='${E2E_PROVIDER}'. Must be 'upstream' or 'azure'." >&2
99+
exit 1
100+
;;
101+
esac
102+
case "${E2E_PROVIDER}" in
103+
upstream) KEDA_NAMESPACE="keda" ;;
104+
azure) KEDA_NAMESPACE="kube-system" ;;
105+
esac
106+
83107
# Export to GITHUB_ENV so all subsequent steps inherit these.
84108
{
85109
echo "GO_VERSION=${GO_VERSION}"
110+
echo "E2E_PROVIDER=${E2E_PROVIDER}"
111+
echo "KEDA_NAMESPACE=${KEDA_NAMESPACE}"
86112
echo "ISTIO_VERSION=${ISTIO_VERSION}"
87113
echo "GATEWAY_API_VERSION=${GATEWAY_API_VERSION}"
88114
echo "BBR_VERSION=${BBR_VERSION}"
@@ -93,6 +119,8 @@ jobs:
93119
94120
- name: Print versions
95121
run: |
122+
echo "E2E_PROVIDER=${E2E_PROVIDER}"
123+
echo "KEDA_NAMESPACE=${KEDA_NAMESPACE}"
96124
echo "GO_VERSION=${GO_VERSION}"
97125
echo "ISTIO_VERSION=${ISTIO_VERSION}"
98126
echo "GATEWAY_API_VERSION=${GATEWAY_API_VERSION}"
@@ -143,9 +171,10 @@ jobs:
143171

144172
- name: Run E2E tests (Nightly specs only)
145173
# Restrict ginkgo to specs tagged with the Nightly label.
174+
# NetworkPolicy specs are temporarily disabled.
146175
run: make test-e2e
147176
env:
148-
E2E_LABEL: "Nightly"
177+
E2E_LABEL: "Nightly && !NetworkPolicy"
149178

150179
- name: Dump cluster state
151180
if: failure()

.github/workflows/e2e.yaml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ on:
1111
description: 'Branch name to run E2E tests against'
1212
required: true
1313
default: 'main'
14+
provider:
15+
description: 'E2E provider — selects component install mix. "upstream" = Helm-installed components; "azure" = AKS managed KEDA add-on, others stay upstream.'
16+
required: false
17+
default: 'upstream'
18+
type: choice
19+
options:
20+
- upstream
21+
- azure
1422
istio_version:
1523
description: 'Istio version (e.g. 1.29.0). Leave empty to use default from versions.env'
1624
required: false
@@ -41,7 +49,7 @@ env:
4149
CLUSTER_NAME: "kaito-gw-e2e-aks-${{ github.run_id }}"
4250
ACR_NAME: "kaitogwe2eaks${{ github.run_id }}acr"
4351
GPU_MOCKER_IMAGE: "gpu-node-mocker:latest-${{ github.run_id }}"
44-
LOCATION: swedencentral
52+
LOCATION: australiaeast
4553
NODE_COUNT: '3'
4654
NODE_VM_SIZE: Standard_D8s_v5
4755

@@ -71,6 +79,7 @@ jobs:
7179
location: ${{ env.LOCATION }}
7280
node-count: ${{ env.NODE_COUNT }}
7381
node-vm-size: ${{ env.NODE_VM_SIZE }}
82+
provider: ${{ github.event.inputs.provider }}
7483
istio-version: ${{ github.event.inputs.istio_version }}
7584
gateway-api-version: ${{ github.event.inputs.gateway_api_version }}
7685
bbr-version: ${{ github.event.inputs.bbr_version }}
@@ -81,9 +90,10 @@ jobs:
8190
- name: Run E2E tests
8291
# Skip Nightly-labeled specs (slow/destructive scaling cases, etc.).
8392
# The Nightly suite runs in e2e-nightly.yaml.
93+
# NetworkPolicy specs are temporarily disabled.
8494
run: make test-e2e
8595
env:
86-
E2E_LABEL: "!Nightly"
96+
E2E_LABEL: "!Nightly && !NetworkPolicy"
8797

8898
- name: Dump cluster state
8999
if: failure()

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,12 @@ test-e2e: ## Run e2e tests against a live cluster (requires KUBECONFIG).
134134
## Component versions are centralized in versions.env (repo root).
135135
## Override any version via environment variables, e.g.:
136136
## ISTIO_VERSION=1.30.0 BBR_VERSION=v1.4.0 make e2e-install
137+
##
138+
## The E2E_PROVIDER master switch (default: upstream) selects how
139+
## infrastructure components are sourced:
140+
## upstream → install everything via Helm/upstream manifests
141+
## azure → enable the AKS managed KEDA add-on at cluster create
142+
## time and skip the standalone Helm KEDA install
137143
## --------------------------------------
138144

139145
.PHONY: e2e

cmd/gpu-node-mocker/main.go

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package main
1818

1919
import (
2020
"flag"
21+
"fmt"
2122
"os"
2223

2324
coordinationv1 "k8s.io/api/coordination/v1"
@@ -26,7 +27,9 @@ import (
2627
"k8s.io/apimachinery/pkg/runtime"
2728
"k8s.io/apimachinery/pkg/runtime/schema"
2829
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
30+
"k8s.io/client-go/discovery"
2931
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
32+
"k8s.io/client-go/rest"
3033
ctrl "sigs.k8s.io/controller-runtime"
3134
"sigs.k8s.io/controller-runtime/pkg/healthz"
3235
"sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -82,7 +85,22 @@ func main() {
8285

8386
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
8487

85-
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
88+
restCfg := ctrl.GetConfigOrDie()
89+
90+
// Fail fast if required CRDs are not yet installed in the cluster. The
91+
// gpu-node-mocker controllers watch karpenter.sh/v1 NodeClaim objects;
92+
// without the CRD, controller-runtime's informers loop on "no kind is
93+
// registered" errors instead of failing. Exiting with a non-zero status
94+
// here lets the Deployment's restart policy back off and retry until
95+
// the KAITO operator (which ships the karpenter CRDs) finishes
96+
// installing them. This unblocks parallel install ordering at the
97+
// shell level (no need to gate on KAITO CRDs before deploying us).
98+
if err := checkRequiredCRDs(restCfg); err != nil {
99+
setupLog.Error(err, "required CRDs are not ready; exiting so the pod is restarted")
100+
os.Exit(1)
101+
}
102+
103+
mgr, err := ctrl.NewManager(restCfg, ctrl.Options{
86104
Scheme: scheme,
87105
Metrics: metricsserver.Options{BindAddress: metricsAddr},
88106
HealthProbeBindAddress: probeAddr,
@@ -120,3 +138,44 @@ func main() {
120138
os.Exit(1)
121139
}
122140
}
141+
142+
// checkRequiredCRDs verifies that every API resource the gpu-node-mocker
143+
// controllers depend on is already registered with the API server. The
144+
// check is done via discovery so it does not require the CRD types to be
145+
// served — only that the apiserver advertises the resource. A single
146+
// missing resource returns an error; the caller is expected to exit so
147+
// the kubelet restarts the pod (the simplest "wait for CRDs" strategy).
148+
func checkRequiredCRDs(cfg *rest.Config) error {
149+
dc, err := discovery.NewDiscoveryClientForConfig(cfg)
150+
if err != nil {
151+
return err
152+
}
153+
154+
required := []struct {
155+
groupVersion string
156+
resource string
157+
}{
158+
// Karpenter NodeClaim CRD is installed by the KAITO workspace
159+
// operator's chart; the NodeClaimReconciler watches it.
160+
{groupVersion: "karpenter.sh/v1", resource: "nodeclaims"},
161+
}
162+
163+
for _, r := range required {
164+
list, err := dc.ServerResourcesForGroupVersion(r.groupVersion)
165+
if err != nil {
166+
return fmt.Errorf("discovering resources for %s: %w", r.groupVersion, err)
167+
}
168+
found := false
169+
for _, api := range list.APIResources {
170+
if api.Name == r.resource {
171+
found = true
172+
break
173+
}
174+
}
175+
if !found {
176+
return fmt.Errorf("required resource %s.%s is not yet registered with the apiserver", r.resource, r.groupVersion)
177+
}
178+
setupLog.Info("required CRD is ready", "groupVersion", r.groupVersion, "resource", r.resource)
179+
}
180+
return nil
181+
}

0 commit comments

Comments
 (0)