Skip to content

Commit 934737c

Browse files
authored
Add decode heavy mode for single model benchmark (#1027)
* add decode heavy mode for single model * address review
1 parent c9a62f4 commit 934737c

6 files changed

Lines changed: 149 additions & 16 deletions

File tree

Makefile

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@ SCALE_TO_ZERO_ENABLED ?= false
2525
SCALER_BACKEND ?= prometheus-adapter # prometheus-adapter (HPA), keda (ScaledObject), or none (skip, use pre-installed backend)
2626
E2E_MONITORING_NAMESPACE ?= workload-variant-autoscaler-monitoring
2727
E2E_EMULATED_LLMD_NAMESPACE ?= llm-d-sim
28+
# Benchmark workload scenario.
# Options: prefill_heavy (phase3a), decode_heavy (decode-heavy)
# NOTE: the comment is on its own line — an inline comment after `?=` would
# become part of the value (including trailing whitespace), breaking the
# ifeq comparison below and the label filter passed to Ginkgo.
BENCHMARK_SCENARIO ?= prefill_heavy

# Map scenario name to Ginkgo label filter
ifeq ($(BENCHMARK_SCENARIO),decode_heavy)
BENCHMARK_LABEL_FILTER := decode-heavy
else
BENCHMARK_LABEL_FILTER := phase3a
endif
2836

2937
# Flags for deploy/install.sh installation script
3038
# Full e2e / CI-style cluster infra (WVA + llm-d, no chart VA/HPA): prefer `make deploy-e2e-infra`
@@ -297,6 +305,7 @@ test-multi-model-scaling: manifests generate fmt vet ## Run multi-model scaling
297305
MM_MIN_REPLICAS=$(MM_MIN_REPLICAS) \
298306
MM_MAX_REPLICAS=$(MM_MAX_REPLICAS) \
299307
GATEWAY_SERVICE_NAME=multi-model-inference-gateway-istio \
308+
BENCHMARK_SCENARIO=$(BENCHMARK_SCENARIO) \
300309
PROMETHEUS_TOKEN=$$(oc whoami -t 2>/dev/null || echo "") \
301310
go test ./test/benchmark/ -timeout 75m -v -ginkgo.v \
302311
-ginkgo.label-filter="multi-model"; \
@@ -368,8 +377,8 @@ test-e2e-full-with-setup: deploy-e2e-infra test-e2e-full
368377

369378
# Benchmark targets
370379
.PHONY: test-benchmark
371-
test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-latency scenario)
372-
@echo "Running benchmark tests..."
380+
test-benchmark: manifests generate fmt vet ## Run benchmark tests. Use BENCHMARK_SCENARIO=decode_heavy for decode-heavy workload.
381+
@echo "Running benchmark tests (scenario=$(BENCHMARK_SCENARIO), label=$(BENCHMARK_LABEL_FILTER))..."
373382
KUBECONFIG=$(KUBECONFIG) \
374383
ENVIRONMENT=$(ENVIRONMENT) \
375384
WVA_NAMESPACE=$(CONTROLLER_NAMESPACE) \
@@ -378,9 +387,10 @@ test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-late
378387
USE_SIMULATOR=$(USE_SIMULATOR) \
379388
SCALER_BACKEND=$(SCALER_BACKEND) \
380389
MODEL_ID=$(MODEL_ID) \
390+
BENCHMARK_SCENARIO=$(BENCHMARK_SCENARIO) \
381391
PROMETHEUS_TOKEN=$$(oc whoami -t 2>/dev/null || echo "") \
382392
go test ./test/benchmark/ -timeout 75m -v -ginkgo.v \
383-
-ginkgo.label-filter="phase3a"; \
393+
-ginkgo.label-filter="$(BENCHMARK_LABEL_FILTER)"; \
384394
TEST_EXIT_CODE=$$?; \
385395
echo ""; \
386396
echo "=========================================="; \

test/benchmark/multi_model_benchmark_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,11 @@ var _ = Describe("Multi-Model Scaling Benchmark", Ordered, Label("benchmark", "m
239239
// launchLoadJobs creates a GuideLLM job for each model targeting the shared Gateway.
240240
launchLoadJobs := func() {
241241
By("Launching GuideLLM load jobs for each model")
242+
scenarioName := testconfig.GetEnv("BENCHMARK_SCENARIO", "prefill_heavy")
243+
scenario := LoadScenario(scenarioName)
244+
GinkgoWriter.Printf(" Scenario: %s (prompt=%d, output=%d, rate=%d)\n",
245+
scenario.Name, scenario.PromptTokens, scenario.OutputTokens, scenario.Rate)
246+
242247
gatewayName := testconfig.GetEnv("GATEWAY_SERVICE_NAME", "multi-model-inference-gateway-istio")
243248
gwHost := fmt.Sprintf("%s.%s.svc.cluster.local", gatewayName, benchCfg.LLMDNamespace)
244249

@@ -248,7 +253,7 @@ var _ = Describe("Multi-Model Scaling Benchmark", Ordered, Label("benchmark", "m
248253

249254
err := CreateGuideLLMJobWithArgs(
250255
testCtx, k8sClient, benchCfg.LLMDNamespace,
251-
m.JobName, targetURL, m.ModelID,
256+
m.JobName, targetURL, m.ModelID, scenario,
252257
)
253258
Expect(err).NotTo(HaveOccurred(), "Failed to create load job "+m.JobName)
254259
}

test/benchmark/prefill_heavy_benchmark_test.go

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ var prefillResults []PrefillResult
7575

7676
const prefillResultsFile = "/tmp/prefill-benchmark-results.json"
7777

78-
var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark", "phase3a"), func() {
78+
var _ = Describe("Scaling Benchmark", Ordered, Label("benchmark"), func() {
7979
var (
8080
ctx context.Context
8181
cancel context.CancelFunc
@@ -439,7 +439,7 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
439439
}
440440
}
441441

442-
runPrefillBenchmark := func(autoscalerType string) {
442+
runBenchmarkScenario := func(autoscalerType string, scenarioName string) {
443443
ensureEPPConfig()
444444
ensureInfraDeploymentReady()
445445
verifyEPPConfig()
@@ -487,9 +487,13 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
487487

488488
By("Launching GuideLLM Load Generator")
489489

490+
scenario := LoadScenario(scenarioName)
491+
GinkgoWriter.Printf(" Scenario: %s (prompt=%d, output=%d, rate=%d)\n",
492+
scenario.Name, scenario.PromptTokens, scenario.OutputTokens, scenario.Rate)
493+
490494
err = CreateGuideLLMJobWithArgs(
491495
ctx, k8sClient, benchCfg.LLMDNamespace, res.ModelService,
492-
targetURL, benchCfg.ModelID,
496+
targetURL, benchCfg.ModelID, scenario,
493497
)
494498
Expect(err).NotTo(HaveOccurred(), "Failed to create GuideLLM load job")
495499

@@ -806,7 +810,7 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
806810
}
807811

808812
GinkgoWriter.Printf("\n ┌────────────────────────────────────────────────────────────\n")
809-
GinkgoWriter.Printf(" │ %s PREFILL BENCHMARK RESULTS\n", autoscalerType)
813+
GinkgoWriter.Printf(" │ %s %s BENCHMARK RESULTS\n", autoscalerType, strings.ToUpper(scenario.Name))
810814
GinkgoWriter.Printf(" │ Model: %s\n", benchCfg.ModelID)
811815
GinkgoWriter.Printf(" ├────────────────────────────────────────────────────────────\n")
812816
GinkgoWriter.Printf(" │ Duration: %.0fs\n", loadDuration)
@@ -841,7 +845,7 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
841845
_ = os.WriteFile(prefillResultsFile, data, 0644)
842846
}
843847

844-
Context("WVA", func() {
848+
Context("WVA Prefill Heavy", Label("phase3a"), func() {
845849
It("should run the prefill heavy workload against WVA", func() {
846850
cleanupAutoscalers()
847851
res.DeploymentName = findInfraDecodeDeployment()
@@ -872,7 +876,42 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
872876

873877
waitForVAAndMetrics()
874878

875-
runPrefillBenchmark("WVA")
879+
runBenchmarkScenario("WVA", "prefill_heavy")
880+
})
881+
})
882+
883+
Context("WVA Decode Heavy", Label("decode-heavy"), func() {
884+
It("should run the decode heavy workload against WVA", func() {
885+
cleanupAutoscalers()
886+
res.DeploymentName = findInfraDecodeDeployment()
887+
ensureInfraDeploymentReady()
888+
889+
By("Creating VariantAutoscaling resource (max=10, cost=10)")
890+
err := fixtures.EnsureVariantAutoscaling(
891+
ctx, crClient, benchCfg.LLMDNamespace, res.VAName, res.DeploymentName,
892+
benchCfg.ModelID, benchCfg.AcceleratorType, 10.0, benchCfg.ControllerInstance,
893+
fixtures.WithMinReplicas(1),
894+
fixtures.WithMaxReplicas(10),
895+
)
896+
Expect(err).NotTo(HaveOccurred(), "Failed to create VA")
897+
898+
By("Creating HPA (Scale Up: 0s/Pods/10/150, Scale Down: 240s/Pods/10/150)")
899+
behavior := &autoscalingv2.HorizontalPodAutoscalerBehavior{
900+
ScaleUp: &autoscalingv2.HPAScalingRules{
901+
StabilizationWindowSeconds: ptr.To(int32(0)),
902+
Policies: []autoscalingv2.HPAScalingPolicy{{Type: autoscalingv2.PodsScalingPolicy, Value: 10, PeriodSeconds: 150}},
903+
},
904+
ScaleDown: &autoscalingv2.HPAScalingRules{
905+
StabilizationWindowSeconds: ptr.To(int32(240)),
906+
Policies: []autoscalingv2.HPAScalingPolicy{{Type: autoscalingv2.PodsScalingPolicy, Value: 10, PeriodSeconds: 150}},
907+
},
908+
}
909+
err = fixtures.EnsureHPA(ctx, k8sClient, benchCfg.LLMDNamespace, res.HPAName, res.DeploymentName, res.VAName, 1, 10, WithBehavior(behavior))
910+
Expect(err).NotTo(HaveOccurred(), "Failed to create HPA")
911+
912+
waitForVAAndMetrics()
913+
914+
runBenchmarkScenario("WVA", "decode_heavy")
876915
})
877916
})
878917

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Decode-heavy GuideLLM benchmark scenario: short prompt, long generation,
# keeping the load on the token-generation (decode) phase.
name: "Decode Heavy"
description: "Stress-tests decode (token generation) with short input, long output"
# Passed to GuideLLM as --data prompt_tokens=<promptTokens>,output_tokens=<outputTokens>
promptTokens: 1000
outputTokens: 4000
# Request rate (--rate) under the given profile; the run is capped at maxSeconds (--max-seconds).
rate: 20
maxSeconds: 600
profile: "poisson"
requestType: "text_completions"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Prefill-heavy GuideLLM benchmark scenario: long prompt, short generation,
# keeping the load on the prompt-processing (prefill) phase. These values
# match the historical hard-coded benchmark defaults.
name: "Prefill Heavy"
description: "Stress-tests prefill (prompt processing) with long input, short output"
# Passed to GuideLLM as --data prompt_tokens=<promptTokens>,output_tokens=<outputTokens>
promptTokens: 4000
outputTokens: 1000
# Request rate (--rate) under the given profile; the run is capped at maxSeconds (--max-seconds).
rate: 20
maxSeconds: 600
profile: "poisson"
requestType: "text_completions"

test/benchmark/workload.go

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ package benchmark
33
import (
44
"context"
55
"fmt"
6+
"os"
7+
"path/filepath"
8+
"runtime"
9+
"strconv"
610
"strings"
711

812
batchv1 "k8s.io/api/batch/v1"
@@ -11,26 +15,85 @@ import (
1115
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1216
"k8s.io/client-go/kubernetes"
1317
"k8s.io/utils/ptr"
18+
"sigs.k8s.io/yaml"
1419
)
1520

16-
// CreateGuideLLMJobWithArgs launches a GuideLLM Job with the specified arguments.
21+
// WorkloadScenario defines the GuideLLM workload parameters loaded from scenarios/ YAML files.
type WorkloadScenario struct {
	Name         string `json:"name" yaml:"name"`
	Description  string `json:"description,omitempty" yaml:"description,omitempty"`
	PromptTokens int    `json:"promptTokens" yaml:"promptTokens"` // input length, forwarded via --data prompt_tokens=
	OutputTokens int    `json:"outputTokens" yaml:"outputTokens"` // generated length, forwarded via --data output_tokens=
	Rate         int    `json:"rate" yaml:"rate"`                 // forwarded via --rate
	MaxSeconds   int    `json:"maxSeconds" yaml:"maxSeconds"`     // forwarded via --max-seconds
	Profile      string `json:"profile" yaml:"profile"`           // forwarded via --profile (e.g. "poisson")
	RequestType  string `json:"requestType" yaml:"requestType"`   // forwarded via --request-type
}

// scenariosDir returns the absolute path to the scenarios/ directory relative
// to this source file. If caller information is unavailable (should never
// happen in practice), it falls back to a CWD-relative "scenarios" path
// instead of silently joining an empty directory.
func scenariosDir() string {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		return "scenarios"
	}
	return filepath.Join(filepath.Dir(thisFile), "scenarios")
}

// defaultScenario returns the fallback prefill_heavy defaults used when no
// scenario file is found or when YAML parsing fails. These values must stay
// in sync with scenarios/prefill_heavy.yaml to preserve the pre-scenario
// behavior of the benchmark.
func defaultScenario() WorkloadScenario {
	return WorkloadScenario{
		Name:         "Prefill Heavy (default)",
		PromptTokens: 4000,
		OutputTokens: 1000,
		Rate:         20,
		MaxSeconds:   600,
		Profile:      "poisson",
		RequestType:  "text_completions",
	}
}
52+
53+
// LoadScenario loads a WorkloadScenario from test/benchmark/scenarios/<name>.yaml.
54+
// If the named file doesn't exist, it falls back to prefill_heavy defaults.
55+
func LoadScenario(name string) WorkloadScenario {
56+
if name == "" {
57+
name = "prefill_heavy"
58+
}
59+
60+
path := filepath.Join(scenariosDir(), name+".yaml")
61+
data, err := os.ReadFile(path)
62+
if err != nil {
63+
// Fallback to prefill_heavy defaults (preserves backward compatibility)
64+
return defaultScenario()
65+
}
66+
67+
var scenario WorkloadScenario
68+
if parseErr := yaml.Unmarshal(data, &scenario); parseErr != nil {
69+
// On parse error, return defaults
70+
return defaultScenario()
71+
}
72+
73+
return scenario
74+
}
75+
76+
// CreateGuideLLMJobWithArgs launches a GuideLLM Job with parameters from the given WorkloadScenario.
1777
func CreateGuideLLMJobWithArgs(
1878
ctx context.Context,
1979
k8sClient *kubernetes.Clientset,
2080
namespace, name, targetServiceURL, modelID string,
81+
scenario WorkloadScenario,
2182
) error {
2283
image := "ghcr.io/vllm-project/guidellm:v0.5.4"
2384

85+
dataArg := "prompt_tokens=" + strconv.Itoa(scenario.PromptTokens) + ",output_tokens=" + strconv.Itoa(scenario.OutputTokens)
86+
2487
args := []string{
2588
"benchmark",
2689
"--target", targetServiceURL,
2790
"--model", modelID,
28-
"--profile", "poisson",
29-
"--rate", "20",
30-
"--max-seconds", "600",
91+
"--profile", scenario.Profile,
92+
"--rate", strconv.Itoa(scenario.Rate),
93+
"--max-seconds", strconv.Itoa(scenario.MaxSeconds),
3194
"--random-seed", "42",
32-
"--request-type", "text_completions",
33-
"--data", "prompt_tokens=4000,output_tokens=1000",
95+
"--request-type", scenario.RequestType,
96+
"--data", dataArg,
3497
"--output-path", "/tmp/benchmarks.json",
3598
"--backend-kwargs", `'{"validate_backend": false}'`,
3699
}

0 commit comments

Comments
 (0)