|
1 | | -name: Nightly - OpenShift E2E Tests |
| 1 | +name: Nightly - CKS E2E Tests |
2 | 2 |
|
3 | | -# Nightly regression test for WVA on OpenShift. |
4 | | -# Calls the reusable workflow from llm-d/llm-d-infra to deploy the |
5 | | -# workload-autoscaling guide stack and run the e2e test suite. |
| 3 | +# Nightly regression test for WVA on CoreWeave Kubernetes (CKS). |
| 4 | +# Calls the reusable CKS helmfile workflow from llm-d/llm-d-infra to deploy |
| 5 | +# the workload-autoscaling guide stack and run the e2e test suite on waldorf. |
6 | 6 |
|
7 | 7 | on: |
8 | 8 | schedule: |
9 | | - - cron: '0 0 * * *' # Midnight UTC daily |
| 9 | + - cron: '30 6 * * *' # 06:30 UTC daily (staggered from IS CKS at 06:00) |
10 | 10 | workflow_dispatch: |
11 | 11 | inputs: |
12 | 12 | model_id: |
13 | 13 | description: 'Model ID' |
14 | 14 | required: false |
15 | 15 | default: 'unsloth/Meta-Llama-3.1-8B' |
16 | 16 | accelerator_type: |
17 | | - description: 'Accelerator type (H100, A100, L40S)' |
| 17 | + description: 'Accelerator type (H100, H200, A100)' |
18 | 18 | required: false |
19 | | - default: 'A100' |
| 19 | + default: 'H100' |
20 | 20 | image_tag: |
21 | 21 | description: 'WVA image tag — "latest" auto-resolves to newest release' |
22 | 22 | required: false |
@@ -46,20 +46,22 @@ permissions: |
46 | 46 | contents: read |
47 | 47 |
|
48 | 48 | concurrency: |
49 | | - group: nightly-e2e-openshift |
| 49 | + group: nightly-e2e-cks-wva |
50 | 50 | cancel-in-progress: true |
51 | 51 |
|
52 | 52 | jobs: |
53 | 53 | nightly: |
54 | | - uses: llm-d/llm-d-infra/.github/workflows/reusable-nightly-e2e-openshift.yaml@main |
| 54 | + uses: llm-d/llm-d-infra/.github/workflows/reusable-nightly-e2e-cks-helmfile.yaml@main |
55 | 55 | with: |
56 | 56 | guide_name: workload-autoscaling |
57 | | - namespace_suffix: nightly-wva |
| 57 | + namespace: llm-d-nightly-wva-cks |
| 58 | + helmfile_env: istio |
| 59 | + gateway_type: istio |
58 | 60 | caller_repo: ${{ github.repository }} |
59 | 61 | caller_ref: ${{ github.ref_name }} |
60 | 62 | deploy_wva: true |
61 | 63 | model_id: ${{ github.event.inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }} |
62 | | - accelerator_type: ${{ github.event.inputs.accelerator_type || 'A100' }} |
| 64 | + accelerator_type: ${{ github.event.inputs.accelerator_type || 'H100' }} |
63 | 65 | wva_image_tag: ${{ github.event.inputs.image_tag || 'latest' }} |
64 | 66 | request_rate: ${{ github.event.inputs.request_rate || '20' }} |
65 | 67 | num_prompts: ${{ github.event.inputs.num_prompts || '3000' }} |
|
68 | 70 | skip_cleanup: ${{ github.event.inputs.skip_cleanup == 'true' }} |
69 | 71 | required_gpus: 2 |
70 | 72 | recommended_gpus: 4 |
| 73 | + allow_gpu_preemption: true |
| 74 | + pod_wait_timeout: '30m' |
| 75 | + pod_readiness_delay: 180 |
| 76 | + image_override: 'ghcr.io/llm-d/llm-d-cuda-dev:latest' |
71 | 77 |     test_target: test-e2e-openshift  # NOTE(review): still the "openshift" target on a CKS run — confirm the reusable CKS workflow / Makefile intentionally reuses this target name
72 | 78 | secrets: inherit |
0 commit comments