1- name : CI Helmfile Dry Run
1+ name : CI Guide Dry Run
22
33on :
44 pull_request :
55 branches : [main]
66 paths :
7- - guides/**/helmfile.yaml.gotmpl
8- - guides/**/values*.yaml
7+ - .github/workflows/ci-helmfile-dry-run.yaml
8+ - .github/scripts/**
9+ - guides/recipes/**
10+ - guides/optimized-baseline/**
11+ - guides/pd-disaggregation/**
12+ - guides/precise-prefix-cache-aware/**
13+ - guides/simulated-accelerators/**
14+ - guides/workload-autoscaling/**
915 - guides/prereq/**
10- - helpers/**
16+ - helpers/client-setup/ **
1117 - docs/monitoring/scripts/install-prometheus-grafana.sh
12- - .github/workflows/ci-helmfile -dry-run.yaml
18+ - .github/workflows/ci-kustomize -dry-run.yaml
1319 workflow_dispatch :
1420
1521permissions :
1925 detect-changes :
2026 runs-on : ubuntu-latest
2127 outputs :
22- inference-scheduling : ${{ steps.filter.outputs.inference-scheduling }}
28+ optimized-baseline : ${{ steps.filter.outputs.optimized-baseline }}
2329 pd-disaggregation : ${{ steps.filter.outputs.pd-disaggregation }}
2430 precise-prefix-cache-aware : ${{ steps.filter.outputs.precise-prefix-cache-aware }}
2531 simulated-accelerators : ${{ steps.filter.outputs.simulated-accelerators }}
@@ -30,14 +36,14 @@ jobs:
3036 id : filter
3137 with :
3238 filters : |
33- inference-scheduling :
39+ optimized-baseline :
3440 - .github/workflows/ci-helmfile-dry-run.yaml
35- - helpers/client-setup/install-deps.sh
36- - guides/prereq/gateway-provider/common-configurations/**
41+ - .github/scripts/install-gke-crds.sh
3742 - guides/prereq/gateway-provider/install-gateway-provider-dependencies.sh
38- - guides/prereq/gateway-provider/*.helmfile.yaml
3943 - docs/monitoring/scripts/install-prometheus-grafana.sh
40- - guides/inference-scheduling/**
44+ - guides/recipes/modelserver/**
45+ - guides/recipes/scheduler/**
46+ - guides/optimized-baseline/**
4147 pd-disaggregation:
4248 - .github/workflows/ci-helmfile-dry-run.yaml
4349 - helpers/client-setup/install-deps.sh
@@ -71,86 +77,97 @@ jobs:
7177 - docs/monitoring/scripts/install-prometheus-grafana.sh
7278 - guides/workload-autoscaling/**
7379
74- dry-run-inference-scheduling :
80+ dry-run-optimized-baseline :
7581 needs : detect-changes
76- if : needs.detect-changes.outputs.inference-scheduling == 'true'
82+ if : needs.detect-changes.outputs.optimized-baseline == 'true'
7783 runs-on : ubuntu-latest
7884 timeout-minutes : 20
7985 steps :
8086 - uses : actions/checkout@v6
8187 with :
8288 persist-credentials : false
8389
84- - name : Install client dependencies
85- run : helpers/client-setup/install-deps.sh
86-
8790 - name : Create kind cluster
8891 uses : helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0
8992 with :
90- cluster_name : helmfile -dry-run
93+ cluster_name : kustomize -dry-run
9194 node_image : kindest/node:v1.35.0 # 1.35+ required for stable DRA (resource.k8s.io/v1)
9295 version : v0.31.0
9396 wait : 120s
9497
9598 - name : Install Gateway API and GAIE CRDs
9699 run : guides/prereq/gateway-provider/install-gateway-provider-dependencies.sh > /dev/null
97100
98- - name : Install gateway providers
99- run : |
100- cd guides/prereq/gateway-provider
101- for f in *.helmfile.yaml; do
102- # Skip kgateway - it is deprecated and its older CRDs would
103- # overwrite the agentgateway installation.
104- [[ "$f" == "kgateway.helmfile.yaml" ]] && continue
105- echo "=== Installing gateway provider: $f ==="
106- helmfile apply -f "$f" > /dev/null
107- done
108-
109- - name : Wait for gateway control planes
110- run : |
111- for ns in istio-system agentgateway-system; do
112- if kubectl get namespace "$ns" &>/dev/null; then
113- echo "Waiting for pods in $ns..."
114- kubectl wait --for=condition=ready pod \
115- --selector=app.kubernetes.io/managed-by=Helm \
116- --namespace "$ns" \
117- --timeout=300s 2>/dev/null || true
118- fi
119- done
120-
121101 - name : Install monitoring CRDs
122102 run : docs/monitoring/scripts/install-prometheus-grafana.sh --crds-only
123103
104+ - name : Install LWS CRDs
105+ run : |
106+ kubectl apply --server-side -f \
107+ https://raw.githubusercontent.com/kubernetes-sigs/lws/v0.7.0/config/crd/bases/leaderworkerset.x-k8s.io_leaderworkersets.yaml
108+
124109 - name : Install GKE CRDs
125110 run : .github/scripts/install-gke-crds.sh
126111
127- - name : Dry-run all environments
112+ - name : Dry-run scheduler charts
128113 run : |
129- cd guides/inference-scheduling
130- # Extract only the YAML before the --- separator; the rest contains
131- # Go template syntax that yq cannot parse.
132- envs=$(awk '/^---/{exit} {print}' helmfile.yaml.gotmpl | yq -r '.environments | keys | .[]')
133- for env_name in $envs; do
114+ CHART_VERSION=v1.4.0
115+ failed=0
116+ for chart in standalone inferencepool; do
134117 echo ""
135118 echo "========================================"
136- echo "inference-scheduling: environment=$env_name "
119+ echo "optimized-baseline: scheduler (${chart}) "
137120 echo "========================================"
138- echo "--- helmfile template ---"
139- helmfile template -e "$env_name" > /tmp/rendered.yaml
140- echo "--- creating namespaces ---"
141- # Collect namespaces from helmfile release definitions and from the
142- # rendered manifests (some resources reference namespaces outside the
143- # release namespace, e.g. monitoring).
144- {
145- helmfile list -e "$env_name" 2>/dev/null | awk 'NR>1 {print $2}'
146- grep -E '^\s+namespace:\s' /tmp/rendered.yaml | awk '{print $2}' | tr -d '"'"'"
147- } | sort -u | while read -r ns; do
148- kubectl create namespace "$ns" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true
149- done
121+ echo "--- helm template ---"
122+ if ! helm template optimized-baseline-scheduler \
123+ "oci://registry.k8s.io/gateway-api-inference-extension/charts/${chart}" \
124+ -f guides/recipes/scheduler/base.values.yaml \
125+ -f guides/recipes/scheduler/features/monitoring.values.yaml \
126+ -f guides/optimized-baseline/scheduler/optimized-baseline.values.yaml \
127+ --version "${CHART_VERSION}" > /tmp/rendered.yaml; then
128+ echo "FAIL: helm template failed for ${chart}"
129+ failed=1
130+ continue
131+ fi
150132 echo "--- kubectl apply --dry-run=server ---"
151- kubectl apply --dry-run=server -f /tmp/rendered.yaml
152- echo "PASS: inference-scheduling / $env_name"
133+ if kubectl apply --dry-run=server -f /tmp/rendered.yaml; then
134+ echo "PASS: optimized-baseline / scheduler (${chart})"
135+ else
136+ echo "FAIL: optimized-baseline / scheduler (${chart})"
137+ failed=1
138+ fi
139+ done
140+ exit $failed
141+
142+ - name : Kustomize build and dry-run all model server overlays
143+ run : |
144+ failed=0
145+ for overlay_dir in guides/optimized-baseline/modelserver/*/; do
146+ # Each accelerator dir may have one or more server subdirs (vllm, sglang)
147+ for kust_dir in "${overlay_dir}"*/; do
148+ [ -f "${kust_dir}kustomization.yaml" ] || continue
149+ name="${kust_dir#guides/optimized-baseline/modelserver/}"
150+ name="${name%/}"
151+ echo ""
152+ echo "========================================"
153+ echo "optimized-baseline: ${name}"
154+ echo "========================================"
155+ echo "--- kustomize build ---"
156+ if ! kustomize build "${kust_dir}" > /tmp/rendered.yaml; then
157+ echo "FAIL: kustomize build failed for ${name}"
158+ failed=1
159+ continue
160+ fi
161+ echo "--- kubectl apply --dry-run=server ---"
162+ if kubectl apply --dry-run=server -f /tmp/rendered.yaml; then
163+ echo "PASS: optimized-baseline / ${name}"
164+ else
165+ echo "FAIL: optimized-baseline / ${name}"
166+ failed=1
167+ fi
168+ done
153169 done
170+ exit $failed
154171
155172 dry-run-pd-disaggregation :
156173 needs : detect-changes
0 commit comments