Skip to content

Commit 934737c

Browse files
authored
Add decode heavy mode for single model benchmark (#1027)
* add decode heavy mode for single model * address review
1 parent c9a62f4 commit 934737c

6 files changed

Lines changed: 149 additions & 16 deletions

File tree

Makefile

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@ SCALE_TO_ZERO_ENABLED ?= false
2525
SCALER_BACKEND ?= prometheus-adapter # prometheus-adapter (HPA), keda (ScaledObject), or none (skip, use pre-installed backend)
2626
E2E_MONITORING_NAMESPACE ?= workload-variant-autoscaler-monitoring
2727
E2E_EMULATED_LLMD_NAMESPACE ?= llm-d-sim
28+
# Benchmark workload scenario.
# Options: prefill_heavy (phase3a), decode_heavy (decode-heavy)
# NOTE: the comment is on its own line — an inline comment after `?=` would
# become part of the value (including trailing whitespace), breaking the
# ifeq comparison below and the label filter passed to Ginkgo.
BENCHMARK_SCENARIO ?= prefill_heavy

# Map scenario name to Ginkgo label filter
ifeq ($(BENCHMARK_SCENARIO),decode_heavy)
BENCHMARK_LABEL_FILTER := decode-heavy
else
BENCHMARK_LABEL_FILTER := phase3a
endif
2836

2937
# Flags for deploy/install.sh installation script
3038
# Full e2e / CI-style cluster infra (WVA + llm-d, no chart VA/HPA): prefer `make deploy-e2e-infra`
@@ -297,6 +305,7 @@ test-multi-model-scaling: manifests generate fmt vet ## Run multi-model scaling
297305
MM_MIN_REPLICAS=$(MM_MIN_REPLICAS) \
298306
MM_MAX_REPLICAS=$(MM_MAX_REPLICAS) \
299307
GATEWAY_SERVICE_NAME=multi-model-inference-gateway-istio \
308+
BENCHMARK_SCENARIO=$(BENCHMARK_SCENARIO) \
300309
PROMETHEUS_TOKEN=$$(oc whoami -t 2>/dev/null || echo "") \
301310
go test ./test/benchmark/ -timeout 75m -v -ginkgo.v \
302311
-ginkgo.label-filter="multi-model"; \
@@ -368,8 +377,8 @@ test-e2e-full-with-setup: deploy-e2e-infra test-e2e-full
368377

369378
# Benchmark targets
370379
.PHONY: test-benchmark
371-
test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-latency scenario)
372-
@echo "Running benchmark tests..."
380+
test-benchmark: manifests generate fmt vet ## Run benchmark tests. Use BENCHMARK_SCENARIO=decode_heavy for decode-heavy workload.
381+
@echo "Running benchmark tests (scenario=$(BENCHMARK_SCENARIO), label=$(BENCHMARK_LABEL_FILTER))..."
373382
KUBECONFIG=$(KUBECONFIG) \
374383
ENVIRONMENT=$(ENVIRONMENT) \
375384
WVA_NAMESPACE=$(CONTROLLER_NAMESPACE) \
@@ -378,9 +387,10 @@ test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-late
378387
USE_SIMULATOR=$(USE_SIMULATOR) \
379388
SCALER_BACKEND=$(SCALER_BACKEND) \
380389
MODEL_ID=$(MODEL_ID) \
390+
BENCHMARK_SCENARIO=$(BENCHMARK_SCENARIO) \
381391
PROMETHEUS_TOKEN=$$(oc whoami -t 2>/dev/null || echo "") \
382392
go test ./test/benchmark/ -timeout 75m -v -ginkgo.v \
383-
-ginkgo.label-filter="phase3a"; \
393+
-ginkgo.label-filter="$(BENCHMARK_LABEL_FILTER)"; \
384394
TEST_EXIT_CODE=$$?; \
385395
echo ""; \
386396
echo "=========================================="; \

test/benchmark/multi_model_benchmark_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,11 @@ var _ = Describe("Multi-Model Scaling Benchmark", Ordered, Label("benchmark", "m
239239
// launchLoadJobs creates a GuideLLM job for each model targeting the shared Gateway.
240240
launchLoadJobs := func() {
241241
By("Launching GuideLLM load jobs for each model")
242+
scenarioName := testconfig.GetEnv("BENCHMARK_SCENARIO", "prefill_heavy")
243+
scenario := LoadScenario(scenarioName)
244+
GinkgoWriter.Printf(" Scenario: %s (prompt=%d, output=%d, rate=%d)\n",
245+
scenario.Name, scenario.PromptTokens, scenario.OutputTokens, scenario.Rate)
246+
242247
gatewayName := testconfig.GetEnv("GATEWAY_SERVICE_NAME", "multi-model-inference-gateway-istio")
243248
gwHost := fmt.Sprintf("%s.%s.svc.cluster.local", gatewayName, benchCfg.LLMDNamespace)
244249

@@ -248,7 +253,7 @@ var _ = Describe("Multi-Model Scaling Benchmark", Ordered, Label("benchmark", "m
248253

249254
err := CreateGuideLLMJobWithArgs(
250255
testCtx, k8sClient, benchCfg.LLMDNamespace,
251-
m.JobName, targetURL, m.ModelID,
256+
m.JobName, targetURL, m.ModelID, scenario,
252257
)
253258
Expect(err).NotTo(HaveOccurred(), "Failed to create load job "+m.JobName)
254259
}

test/benchmark/prefill_heavy_benchmark_test.go

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ var prefillResults []PrefillResult
7575

7676
const prefillResultsFile = "/tmp/prefill-benchmark-results.json"
7777

78-
var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark", "phase3a"), func() {
78+
var _ = Describe("Scaling Benchmark", Ordered, Label("benchmark"), func() {
7979
var (
8080
ctx context.Context
8181
cancel context.CancelFunc
@@ -439,7 +439,7 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
439439
}
440440
}
441441

442-
runPrefillBenchmark := func(autoscalerType string) {
442+
runBenchmarkScenario := func(autoscalerType string, scenarioName string) {
443443
ensureEPPConfig()
444444
ensureInfraDeploymentReady()
445445
verifyEPPConfig()
@@ -487,9 +487,13 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
487487

488488
By("Launching GuideLLM Load Generator")
489489

490+
scenario := LoadScenario(scenarioName)
491+
GinkgoWriter.Printf(" Scenario: %s (prompt=%d, output=%d, rate=%d)\n",
492+
scenario.Name, scenario.PromptTokens, scenario.OutputTokens, scenario.Rate)
493+
490494
err = CreateGuideLLMJobWithArgs(
491495
ctx, k8sClient, benchCfg.LLMDNamespace, res.ModelService,
492-
targetURL, benchCfg.ModelID,
496+
targetURL, benchCfg.ModelID, scenario,
493497
)
494498
Expect(err).NotTo(HaveOccurred(), "Failed to create GuideLLM load job")
495499

@@ -806,7 +810,7 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
806810
}
807811

808812
GinkgoWriter.Printf("\n ┌────────────────────────────────────────────────────────────\n")
809-
GinkgoWriter.Printf(" │ %s PREFILL BENCHMARK RESULTS\n", autoscalerType)
813+
GinkgoWriter.Printf(" │ %s %s BENCHMARK RESULTS\n", autoscalerType, strings.ToUpper(scenario.Name))
810814
GinkgoWriter.Printf(" │ Model: %s\n", benchCfg.ModelID)
811815
GinkgoWriter.Printf(" ├────────────────────────────────────────────────────────────\n")
812816
GinkgoWriter.Printf(" │ Duration: %.0fs\n", loadDuration)
@@ -841,7 +845,7 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
841845
_ = os.WriteFile(prefillResultsFile, data, 0644)
842846
}
843847

844-
Context("WVA", func() {
848+
Context("WVA Prefill Heavy", Label("phase3a"), func() {
845849
It("should run the prefill heavy workload against WVA", func() {
846850
cleanupAutoscalers()
847851
res.DeploymentName = findInfraDecodeDeployment()
@@ -872,7 +876,42 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Ordered, Label("benchmark",
872876

873877
waitForVAAndMetrics()
874878

875-
runPrefillBenchmark("WVA")
879+
runBenchmarkScenario("WVA", "prefill_heavy")
880+
})
881+
})
882+
883+
Context("WVA Decode Heavy", Label("decode-heavy"), func() {
884+
It("should run the decode heavy workload against WVA", func() {
885+
cleanupAutoscalers()
886+
res.DeploymentName = findInfraDecodeDeployment()
887+
ensureInfraDeploymentReady()
888+
889+
By("Creating VariantAutoscaling resource (max=10, cost=10)")
890+
err := fixtures.EnsureVariantAutoscaling(
891+
ctx, crClient, benchCfg.LLMDNamespace, res.VAName, res.DeploymentName,
892+
benchCfg.ModelID, benchCfg.AcceleratorType, 10.0, benchCfg.ControllerInstance,
893+
fixtures.WithMinReplicas(1),
894+
fixtures.WithMaxReplicas(10),
895+
)
896+
Expect(err).NotTo(HaveOccurred(), "Failed to create VA")
897+
898+
By("Creating HPA (Scale Up: 0s/Pods/10/150, Scale Down: 240s/Pods/10/150)")
899+
behavior := &autoscalingv2.HorizontalPodAutoscalerBehavior{
900+
ScaleUp: &autoscalingv2.HPAScalingRules{
901+
StabilizationWindowSeconds: ptr.To(int32(0)),
902+
Policies: []autoscalingv2.HPAScalingPolicy{{Type: autoscalingv2.PodsScalingPolicy, Value: 10, PeriodSeconds: 150}},
903+
},
904+
ScaleDown: &autoscalingv2.HPAScalingRules{
905+
StabilizationWindowSeconds: ptr.To(int32(240)),
906+
Policies: []autoscalingv2.HPAScalingPolicy{{Type: autoscalingv2.PodsScalingPolicy, Value: 10, PeriodSeconds: 150}},
907+
},
908+
}
909+
err = fixtures.EnsureHPA(ctx, k8sClient, benchCfg.LLMDNamespace, res.HPAName, res.DeploymentName, res.VAName, 1, 10, WithBehavior(behavior))
910+
Expect(err).NotTo(HaveOccurred(), "Failed to create HPA")
911+
912+
waitForVAAndMetrics()
913+
914+
runBenchmarkScenario("WVA", "decode_heavy")
876915
})
877916
})
878917

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Decode-heavy GuideLLM benchmark scenario: short prompt, long generation,
# keeping the load on the token-generation (decode) phase.
name: "Decode Heavy"
description: "Stress-tests decode (token generation) with short input, long output"
# Passed to GuideLLM as --data prompt_tokens=<promptTokens>,output_tokens=<outputTokens>
promptTokens: 1000
outputTokens: 4000
# Request rate (--rate) under the given profile; the run is capped at maxSeconds (--max-seconds).
rate: 20
maxSeconds: 600
profile: "poisson"
requestType: "text_completions"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Prefill-heavy GuideLLM benchmark scenario: long prompt, short generation,
# keeping the load on the prompt-processing (prefill) phase. These values
# match the historical hard-coded benchmark defaults.
name: "Prefill Heavy"
description: "Stress-tests prefill (prompt processing) with long input, short output"
# Passed to GuideLLM as --data prompt_tokens=<promptTokens>,output_tokens=<outputTokens>
promptTokens: 4000
outputTokens: 1000
# Request rate (--rate) under the given profile; the run is capped at maxSeconds (--max-seconds).
rate: 20
maxSeconds: 600
profile: "poisson"
requestType: "text_completions"

test/benchmark/workload.go

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ package benchmark
33
import (
44
"context"
55
"fmt"
6+
"os"
7+
"path/filepath"
8+
"runtime"
9+
"strconv"
610
"strings"
711

812
batchv1 "k8s.io/api/batch/v1"
@@ -11,26 +15,85 @@ import (
1115
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1216
"k8s.io/client-go/kubernetes"
1317
"k8s.io/utils/ptr"
18+
"sigs.k8s.io/yaml"
1419
)
1520

16-
// CreateGuideLLMJobWithArgs launches a GuideLLM Job with the specified arguments.
21+
// WorkloadScenario defines the GuideLLM workload parameters loaded from scenarios/ YAML files.
type WorkloadScenario struct {
	Name         string `json:"name" yaml:"name"`
	Description  string `json:"description,omitempty" yaml:"description,omitempty"`
	PromptTokens int    `json:"promptTokens" yaml:"promptTokens"` // input length, forwarded via --data prompt_tokens=
	OutputTokens int    `json:"outputTokens" yaml:"outputTokens"` // generated length, forwarded via --data output_tokens=
	Rate         int    `json:"rate" yaml:"rate"`                 // forwarded via --rate
	MaxSeconds   int    `json:"maxSeconds" yaml:"maxSeconds"`     // forwarded via --max-seconds
	Profile      string `json:"profile" yaml:"profile"`           // forwarded via --profile (e.g. "poisson")
	RequestType  string `json:"requestType" yaml:"requestType"`   // forwarded via --request-type
}

// scenariosDir returns the absolute path to the scenarios/ directory relative
// to this source file. If caller information is unavailable (should never
// happen in practice), it falls back to a CWD-relative "scenarios" path
// instead of silently joining an empty directory.
func scenariosDir() string {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		return "scenarios"
	}
	return filepath.Join(filepath.Dir(thisFile), "scenarios")
}

// defaultScenario returns the fallback prefill_heavy defaults used when no
// scenario file is found or when YAML parsing fails. These values must stay
// in sync with scenarios/prefill_heavy.yaml to preserve the pre-scenario
// behavior of the benchmark.
func defaultScenario() WorkloadScenario {
	return WorkloadScenario{
		Name:         "Prefill Heavy (default)",
		PromptTokens: 4000,
		OutputTokens: 1000,
		Rate:         20,
		MaxSeconds:   600,
		Profile:      "poisson",
		RequestType:  "text_completions",
	}
}
52+
53+
// LoadScenario loads a WorkloadScenario from test/benchmark/scenarios/<name>.yaml.
54+
// If the named file doesn't exist, it falls back to prefill_heavy defaults.
55+
func LoadScenario(name string) WorkloadScenario {
56+
if name == "" {
57+
name = "prefill_heavy"
58+
}
59+
60+
path := filepath.Join(scenariosDir(), name+".yaml")
61+
data, err := os.ReadFile(path)
62+
if err != nil {
63+
// Fallback to prefill_heavy defaults (preserves backward compatibility)
64+
return defaultScenario()
65+
}
66+
67+
var scenario WorkloadScenario
68+
if parseErr := yaml.Unmarshal(data, &scenario); parseErr != nil {
69+
// On parse error, return defaults
70+
return defaultScenario()
71+
}
72+
73+
return scenario
74+
}
75+
76+
// CreateGuideLLMJobWithArgs launches a GuideLLM Job with parameters from the given WorkloadScenario.
1777
func CreateGuideLLMJobWithArgs(
1878
ctx context.Context,
1979
k8sClient *kubernetes.Clientset,
2080
namespace, name, targetServiceURL, modelID string,
81+
scenario WorkloadScenario,
2182
) error {
2283
image := "ghcr.io/vllm-project/guidellm:v0.5.4"
2384

85+
dataArg := "prompt_tokens=" + strconv.Itoa(scenario.PromptTokens) + ",output_tokens=" + strconv.Itoa(scenario.OutputTokens)
86+
2487
args := []string{
2588
"benchmark",
2689
"--target", targetServiceURL,
2790
"--model", modelID,
28-
"--profile", "poisson",
29-
"--rate", "20",
30-
"--max-seconds", "600",
91+
"--profile", scenario.Profile,
92+
"--rate", strconv.Itoa(scenario.Rate),
93+
"--max-seconds", strconv.Itoa(scenario.MaxSeconds),
3194
"--random-seed", "42",
32-
"--request-type", "text_completions",
33-
"--data", "prompt_tokens=4000,output_tokens=1000",
95+
"--request-type", scenario.RequestType,
96+
"--data", dataArg,
3497
"--output-path", "/tmp/benchmarks.json",
3598
"--backend-kwargs", `'{"validate_backend": false}'`,
3699
}

0 commit comments

Comments
 (0)