865 changes: 851 additions & 14 deletions .github/workflows/ci-benchmark.yaml

Large diffs are not rendered by default.

78 changes: 0 additions & 78 deletions .github/workflows/nightly-e2e-cks.yaml

This file was deleted.

3 changes: 3 additions & 0 deletions .gitignore
@@ -35,3 +35,6 @@ llmd-infra/

*.tgz
actionlint

# AI
.claude
47 changes: 47 additions & 0 deletions .golangci.yml
@@ -0,0 +1,47 @@
version: "2"
run:
allow-parallel-runners: true
linters:
default: none
enable:
- copyloopvar
- dupword
- durationcheck
- errcheck
- fatcontext
- ginkgolinter
- goconst
- gocritic
- govet
- ineffassign
- loggercheck
- makezero
- misspell
- nakedret
- perfsprint
- prealloc
- revive
- staticcheck
- unconvert
- unparam
- unused
settings:
revive:
rules:
- name: comment-spacings
exclusions:
generated: lax
presets:
- comments
- common-false-positives
- legacy
- std-error-handling
rules:
- linters: [staticcheck]
text: "SA1019:.*Accelerator.*is deprecated"
paths:
- bin
formatters:
enable:
- gofmt
- goimports
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -56,7 +56,7 @@ The repository uses AI-powered workflows to automate repetitive tasks:
- **Workflow Creation**: Interactive designer for new workflows
- **Workflow Debugging**: Assists with troubleshooting

Learn more in the [Agentic Workflows Guide](docs/developer-guide/agentic-workflows.md).
Learn more in the [Developer Guide](docs/developer-guide/development.md).

## WVA Project Structure

22 changes: 20 additions & 2 deletions Makefile
@@ -281,8 +281,8 @@ test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-late
USE_SIMULATOR=$(USE_SIMULATOR) \
SCALER_BACKEND=$(SCALER_BACKEND) \
MODEL_ID=$(MODEL_ID) \
go test ./test/benchmark/ -timeout 30m -v -ginkgo.v \
-ginkgo.label-filter="benchmark"; \
go test ./test/benchmark/ -timeout 75m -v -ginkgo.v \
-ginkgo.label-filter="phase3a"; \
TEST_EXIT_CODE=$$?; \
echo ""; \
echo "=========================================="; \
@@ -294,6 +294,24 @@ test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-late
.PHONY: test-benchmark-with-setup
test-benchmark-with-setup: deploy-e2e-infra test-benchmark

# Stub for llm-d nightly reusable workflows (test_target=nightly-test-llm-d)
# No-op; temporarily satisfies nightly CI make invocation
# TODO: add nightly guide tests here
.PHONY: nightly-test-llm-d
nightly-test-llm-d: ## Nightly CI: noop; use as test_target instead of empty string
@:

# Shared script: deploy/lib/llm_d_nightly_install.sh
# Canonical target for llm-d-infra nightly reusables: ENVIRONMENT=openshift|kubernetes
.PHONY: nightly-deploy-wva-guide
nightly-deploy-wva-guide: ## Nightly: full WVA+llm-d stack from job env (WVA_NS <- WVA_NAMESPACE or CONTROLLER_NAMESPACE)
@export WVA_NS="$${WVA_NS:-$${WVA_NAMESPACE:-$${CONTROLLER_NAMESPACE:-}}}"; \
if [ "$${ENVIRONMENT:-}" = openshift ]; then \
LLM_D_NIGHTLY_PLATFORM=openshift bash "$(CURDIR)/deploy/lib/llm_d_nightly_install.sh" "$(CURDIR)"; \
else \
LLM_D_NIGHTLY_PLATFORM=cks bash "$(CURDIR)/deploy/lib/llm_d_nightly_install.sh" "$(CURDIR)"; \
fi
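The `WVA_NS` fallback chain in the recipe above can be sketched in plain shell; the namespace value here is hypothetical, and only `WVA_NAMESPACE` is assumed set in the job environment:

```shell
#!/bin/sh
# Hypothetical job environment: only WVA_NAMESPACE is set.
WVA_NAMESPACE="wva-system"

# Same fallback chain as the Makefile recipe:
# WVA_NS <- WVA_NAMESPACE <- CONTROLLER_NAMESPACE <- "".
WVA_NS="${WVA_NS:-${WVA_NAMESPACE:-${CONTROLLER_NAMESPACE:-}}}"
echo "$WVA_NS"   # → wva-system
```

The first non-empty variable in the chain wins, so jobs that only export `CONTROLLER_NAMESPACE` still resolve a namespace.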

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
$(GOLANGCI_LINT) run
13 changes: 3 additions & 10 deletions README.md
@@ -8,7 +8,7 @@ The Workload Variant Autoscaler (WVA) is a Kubernetes-based global autoscaler fo

### What is a variant?

In WVA, a **variant** is a way of serving a given model: a scale target (Deployment, StatefulSet, or LWS) with a particular combination of hardware, runtimes, and serving approach. Variants for the same model share the same base model (e.g. meta/llama-3.1-8b); LoRA adapters can differ per variant. Each variant is a distinct setup—e.g. different accelerators (A100, H100, L4), parallelism, or performance requirements. Create one `VariantAutoscaling` per variant; when several variants serve the same model, WVA chooses which to scale (e.g. add capacity on the cheapest variant, remove it from the most expensive). See [Configuration](docs/user-guide/configuration.md) and [Saturation Analyzer](docs/saturation-analyzer.md) for details.
In WVA, a **variant** is a way of serving a given model: a scale target (Deployment, StatefulSet, or LWS) with a particular combination of hardware, runtimes, and serving approach. Variants for the same model share the same base model (e.g. meta/llama-3.1-8b); LoRA adapters can differ per variant. Each variant is a distinct setup—e.g. different accelerators (A100, H100, L4), parallelism, or performance requirements. Create one `VariantAutoscaling` per variant; when several variants serve the same model, WVA chooses which to scale (e.g. add capacity on the cheapest variant, remove it from the most expensive). See [Configuration](docs/user-guide/configuration.md) and [Saturation Analyzer](docs/user-guide/saturation-analyzer.md) for details.
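A minimal sketch of one `VariantAutoscaling` per variant, modeled on the sample manifests added elsewhere in this PR; the names and accelerator label value are placeholders:

```yaml
apiVersion: llmd.ai/v1alpha1
kind: VariantAutoscaling
metadata:
  name: llama-3-1-8b-a100          # one VariantAutoscaling per variant
  namespace: llm-d-sim
  labels:
    inference.optimization/acceleratorName: A100
spec:
  scaleTargetRef:
    kind: Deployment               # Deployment, StatefulSet, or LWS
    name: llama-3-1-8b-a100
  modelID: meta/llama-3.1-8b       # variants of the same model share this
```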

<!--
<![Architecture](docs/design/diagrams/inferno-WVA-design.png)>
@@ -29,16 +29,9 @@ In WVA, a **variant** is a way of serving a given model: a scale target (Deploym
- [CRD Reference](docs/user-guide/crd-reference.md)
- [Multi-Controller Isolation](docs/user-guide/multi-controller-isolation.md)

<!--

### Tutorials
- [Quick Start Demo](docs/tutorials/demo.md)
- [Parameter Estimation](docs/tutorials/parameter-estimation.md)
- [vLLM Server Setup](docs/tutorials/vllm-samples.md)
-->
### Integrations
- [HPA Integration](docs/integrations/hpa-integration.md)
- [KEDA Integration](docs/integrations/keda-integration.md)
- [HPA Integration](docs/user-guide/hpa-integration.md)
- [KEDA Integration](docs/user-guide/keda-integration.md)
- [Prometheus Metrics](docs/integrations/prometheus.md)

<!--
1 change: 1 addition & 0 deletions api/v1alpha1/variantautoscaling_types.go
@@ -72,6 +72,7 @@ type OptimizedAlloc struct {
LastRunTime metav1.Time `json:"lastRunTime,omitempty"`

// Accelerator is the type of accelerator for the optimized allocation.
//
// Deprecated: This field is deprecated and will be removed in a future version. Use node selector or node affinity from scale target instead.
// +optional
Accelerator string `json:"accelerator,omitempty"`
2 changes: 1 addition & 1 deletion charts/workload-variant-autoscaler/README.md
@@ -248,7 +248,7 @@ HPA_STABILIZATION_SECONDS=120 ./deploy/install.sh
- **Development**: Use 30-60 seconds for faster iteration
- **E2E Tests**: Use 30 seconds for rapid validation

See [HPA Integration Guide](../../docs/integrations/hpa-integration.md) for detailed information.
See [HPA Integration Guide](../../docs/user-guide/hpa-integration.md) for detailed information.

### Usage Examples

10 changes: 6 additions & 4 deletions cmd/main.go
@@ -19,6 +19,7 @@ package main
import (
"context"
"crypto/tls"
"errors"
goflag "flag"
"fmt"
"net/http"
@@ -68,7 +69,7 @@
inferencePoolV1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
inferencePoolV1alpha2 "sigs.k8s.io/gateway-api-inference-extension/apix/v1alpha2"
lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
//+kubebuilder:scaffold:imports
// +kubebuilder:scaffold:imports
)

var (
@@ -82,7 +83,7 @@ func init() {
utilruntime.Must(inferencePoolV1.Install(scheme))
utilruntime.Must(inferencePoolV1alpha2.Install(scheme))
// Note: LeaderWorkerSet scheme is added conditionally in main() after checking if CRD exists
//+kubebuilder:scaffold:scheme
// +kubebuilder:scaffold:scheme
}

// checkLeaderWorkerSetCRD checks if the LeaderWorkerSet CRD is installed in the cluster
@@ -189,7 +190,8 @@ func main() {
cfg, err := config.Load(flag.CommandLine, *configFilePath)
if err != nil {
setupLog.Error(err, "failed to load configuration - this is a fatal error")
os.Exit(1)
logging.Sync() //nolint:errcheck
os.Exit(1) //nolint:gocritic // exitAfterDefer: Sync() called explicitly above
}
setupLog.Info("Configuration loaded successfully")

@@ -544,7 +546,7 @@ func main() {
if syncErr != "" {
return fmt.Errorf("initial ConfigMap bootstrap not complete: %s", syncErr)
}
return fmt.Errorf("initial ConfigMap bootstrap not complete")
return errors.New("initial ConfigMap bootstrap not complete")
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
File renamed without changes.
7 changes: 7 additions & 0 deletions config/samples/hpa/kustomization.yaml
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
metadata:
name: hpa-sample
resources:
- va.yaml
- hpa.yaml
14 changes: 14 additions & 0 deletions config/samples/hpa/va.yaml
@@ -0,0 +1,14 @@
# Example VariantAutoscaling for HPA/KEDA integration.
# Ensure a Deployment named sample-deployment exists in llm-d-sim (e.g. from kind-emulator or e2e).
apiVersion: llmd.ai/v1alpha1
kind: VariantAutoscaling
metadata:
name: sample-deployment
namespace: llm-d-sim
labels:
inference.optimization/acceleratorName: A100
spec:
scaleTargetRef:
kind: Deployment
name: sample-deployment
modelID: default/default
7 changes: 7 additions & 0 deletions config/samples/keda/kustomization.yaml
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
metadata:
name: keda-sample
resources:
- va.yaml
- scaledobject.yaml
14 changes: 14 additions & 0 deletions config/samples/keda/va.yaml
@@ -0,0 +1,14 @@
# Example VariantAutoscaling for HPA/KEDA integration.
# Ensure a Deployment named sample-deployment exists in llm-d-sim (e.g. from kind-emulator or e2e).
apiVersion: llmd.ai/v1alpha1
kind: VariantAutoscaling
metadata:
name: sample-deployment
namespace: llm-d-sim
labels:
inference.optimization/acceleratorName: A100
spec:
scaleTargetRef:
kind: Deployment
name: sample-deployment
modelID: default/default
2 changes: 1 addition & 1 deletion deploy/README.md
@@ -706,7 +706,7 @@ The `VLLM_MAX_NUM_SEQS` variable controls the maximum number of concurrent seque

**Use cases:**
- **E2E Testing**: Set to low values (e.g., `8` or `16`) to quickly trigger saturation and test autoscaling
- **Parameter Estimation**: Match this to your desired maximum batch size (see [Parameter Estimation Guide](../docs/tutorials/parameter-estimation.md))
- **Parameter Estimation**: Match this to your desired maximum batch size (see [Configuration Guide](../docs/user-guide/configuration.md))
- **Production**: Leave unset to use vLLM's default based on available GPU memory

**Example:**
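The example body is truncated in this diff view. As a hedged sketch only, a script reading this knob with a default would follow the usual shell pattern; the default of 16 here is an arbitrary illustration, not install.sh's actual behavior (which leaves the variable unset):

```shell
#!/bin/sh
# Hypothetical default handling for VLLM_MAX_NUM_SEQS;
# low values (8 or 16) trigger saturation quickly in e2e tests.
VLLM_MAX_NUM_SEQS="${VLLM_MAX_NUM_SEQS:-16}"
echo "max_num_seqs=$VLLM_MAX_NUM_SEQS"   # → max_num_seqs=16
```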
3 changes: 1 addition & 2 deletions deploy/install.sh
@@ -58,8 +58,7 @@ ITL_AVERAGE_LATENCY_MS=${ITL_AVERAGE_LATENCY_MS:-20}
TTFT_AVERAGE_LATENCY_MS=${TTFT_AVERAGE_LATENCY_MS:-200}
ENABLE_SCALE_TO_ZERO=${ENABLE_SCALE_TO_ZERO:-true}
# llm-d-inference scheduler with image with flowcontrol support
# TODO: update once the llm-d-inference-scheduler v0.5.0 is released
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.5.0-rc.1"}
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.7.0"}

# Gateway Configuration
GATEWAY_PROVIDER=${GATEWAY_PROVIDER:-"istio"} # Options: kgateway, istio