fix(dynamo): reject non-vllm mocker at admission and fix review nits

surajssd · surajssd · commit 98583cee6a5f · 2026-06-01T17:11:19.000-07:00
Address Copilot review feedback on the Dynamo mocker E2E PR:

- Enforce the vLLM-only constraint for mocker mode at admission time in the
  validating webhook, so a non-vllm `engine.type` combined with the
  `airunway.ai/dynamo-test-backend=mocker` annotation is rejected up front
  instead of being admitted and failing later during provider reconcile. Add a
  webhook test covering the `sglang`-on-`dynamo` rejection.
- Fix stale comments in `buildAggregatedWorker`, `buildPrefillWorker`, and
  `buildDecodeWorker`: mocker mode does not drop all GPU/CPU requests, it
  replaces GPU resources with small CPU/memory requests+limits to keep the
  worker Burstable rather than BestEffort.
- Rename the local `yaml` variable in `testCreateMockerModelDeployment` to
  `manifest` to stop it shadowing the `sigs.k8s.io/yaml` package import.
- Drop the `-run TestDynamoMocker` filter from the `test-e2e-mocker` target so
  `TestInjectMockerAnnotation` also runs in CI; the GPU lane self-skips without
  `DYNAMO_INSTALLED`.

Signed-off-by: Suraj Deshmukh <suraj.deshmukh@microsoft.com>
diff --git a/controller/internal/webhook/v1alpha1/modeldeployment_webhook.go b/controller/internal/webhook/v1alpha1/modeldeployment_webhook.go
@@ -303,6 +303,19 @@ func (v *ModelDeploymentCustomValidator) validateSpec(ctx context.Context, obj *
 	isDynamoMocker := obj.Annotations["airunway.ai/dynamo-test-backend"] == "mocker" &&
 		spec.Provider != nil && spec.Provider.Name == "dynamo"
 
+	// The Dynamo mocker backend only simulates the vLLM engine. Enforce the
+	// vLLM-only constraint at admission so a non-vllm engine + mocker annotation
+	// is rejected here rather than admitted and failing later during provider
+	// reconciliation (the dynamo provider re-validates this too). An empty engine
+	// type is allowed — the provider defaults it to vllm.
+	if isDynamoMocker && spec.Engine.Type != "" && spec.Engine.Type != airunwayv1alpha1.EngineTypeVLLM {
+		allErrs = append(allErrs, field.Invalid(
+			specPath.Child("engine", "type"),
+			spec.Engine.Type,
+			"the dynamo mocker test backend only supports the vllm engine",
+		))
+	}
+
 	if !isDynamoMocker && spec.Provider != nil && spec.Provider.Name != "" && spec.Engine.Type != "" && v.Reader != nil {
 		var providerConfig airunwayv1alpha1.InferenceProviderConfig
 		err := v.Reader.Get(ctx, client.ObjectKey{Name: spec.Provider.Name}, &providerConfig)
diff --git a/controller/internal/webhook/v1alpha1/modeldeployment_webhook_test.go b/controller/internal/webhook/v1alpha1/modeldeployment_webhook_test.go
@@ -1446,5 +1446,16 @@ var _ = Describe("ModelDeployment Webhook", func() {
 			Expect(err).To(HaveOccurred())
 			Expect(err.Error()).To(ContainSubstring("gpu.count > 0"))
 		})
+
+		It("Should reject a non-vllm engine on dynamo even with the mocker annotation", func() {
+			obj.Annotations = map[string]string{"airunway.ai/dynamo-test-backend": "mocker"}
+			obj.Spec.Model.ID = "Qwen/Qwen3-0.6B"
+			obj.Spec.Engine.Type = airunwayv1alpha1.EngineTypeSGLang
+			obj.Spec.Provider = &airunwayv1alpha1.ProviderSpec{Name: "dynamo"}
+			obj.Spec.Resources = &airunwayv1alpha1.ResourceSpec{GPU: &airunwayv1alpha1.GPUSpec{Count: 0}}
+			_, err := validator.ValidateCreate(ctx, obj)
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("only supports the vllm engine"))
+		})
 	})
 })
diff --git a/providers/dynamo/Makefile b/providers/dynamo/Makefile
@@ -108,5 +108,7 @@ test-e2e:
 
 ## Run the CPU-only mocker e2e tests (aggregated + disaggregated). Requires a cluster
 ## with the Dynamo platform (see setup-dynamo-mocker) and the dynamo provider deployed.
+## No -run filter: the GPU lane (TestDynamoProviderE2E) self-skips without DYNAMO_INSTALLED,
+## so dropping it lets the unit-style TestInjectMockerAnnotation run here too.
 test-e2e-mocker:
-	DYNAMO_MOCKER=true go test -count=1 -tags=e2e -v -timeout 30m ./test/e2e/ -run TestDynamoMocker
+	DYNAMO_MOCKER=true go test -count=1 -tags=e2e -v -timeout 30m ./test/e2e/
diff --git a/providers/dynamo/test/e2e/dynamo_mocker_e2e_test.go b/providers/dynamo/test/e2e/dynamo_mocker_e2e_test.go
@@ -152,8 +152,8 @@ func testCreateMockerModelDeployment(t *testing.T, tc mockerCase) {
 		t.Fatalf("failed to read fixture %s: %v", path, err)
 	}
 
-	yaml := injectMockerAnnotation(t, string(raw))
-	out, err := kubectlApplyLiteral(t, yaml)
+	manifest := injectMockerAnnotation(t, string(raw))
+	out, err := kubectlApplyLiteral(t, manifest)
 	if err != nil {
 		t.Fatalf("failed to apply mocker ModelDeployment: %v\nOutput: %s", err, out)
 	}
diff --git a/providers/dynamo/transformer.go b/providers/dynamo/transformer.go
@@ -512,8 +512,9 @@ func (t *Transformer) buildAggregatedWorker(md *airunwayv1alpha1.ModelDeployment
 
 	command := t.engineCommand(md.ResolvedEngineType())
 
-	// Mocker mode: swap the real engine for python3 -m dynamo.mocker and drop
-	// all GPU/CPU resource requests so the worker schedules on CPU-only nodes.
+	// Mocker mode: swap the real engine for python3 -m dynamo.mocker and replace
+	// the GPU resources with small CPU/memory requests+limits (no GPU) so the
+	// worker schedules on CPU-only nodes while staying Burstable, not BestEffort.
 	if isMockerMode(md) {
 		command = mockerCommand()
 		args = buildMockerArgs(md)
@@ -610,9 +611,10 @@ func (t *Transformer) buildPrefillWorker(md *airunwayv1alpha1.ModelDeployment, i
 
 	command := t.engineCommand(md.ResolvedEngineType())
 
-	// Mocker mode: swap the real engine for python3 -m dynamo.mocker and drop
-	// GPU resource requests. The mocker keeps --disaggregation-mode but does
-	// NOT use --kv-transfer-config (that NIXL flag is real-vLLM-only).
+	// Mocker mode: swap the real engine for python3 -m dynamo.mocker and replace
+	// the GPU resources with small CPU/memory requests+limits (no GPU). The mocker
+	// keeps --disaggregation-mode but does NOT use --kv-transfer-config (that NIXL
+	// flag is real-vLLM-only).
 	if isMockerMode(md) {
 		command = mockerCommand()
 		args = append(buildMockerArgs(md), "--disaggregation-mode", SubComponentTypePrefill)
@@ -686,9 +688,10 @@ func (t *Transformer) buildDecodeWorker(md *airunwayv1alpha1.ModelDeployment, im
 
 	command := t.engineCommand(md.ResolvedEngineType())
 
-	// Mocker mode: swap the real engine for python3 -m dynamo.mocker and drop
-	// GPU resource requests. The mocker keeps --disaggregation-mode but does
-	// NOT use --kv-transfer-config (that NIXL flag is real-vLLM-only).
+	// Mocker mode: swap the real engine for python3 -m dynamo.mocker and replace
+	// the GPU resources with small CPU/memory requests+limits (no GPU). The mocker
+	// keeps --disaggregation-mode but does NOT use --kv-transfer-config (that NIXL
+	// flag is real-vLLM-only).
 	if isMockerMode(md) {
 		command = mockerCommand()
 		args = append(buildMockerArgs(md), "--disaggregation-mode", SubComponentTypeDecode)

Original file line number	Diff line number	Diff line change
`@@ -152,8 +152,8 @@ func testCreateMockerModelDeployment(t *testing.T, tc mockerCase) {`
`152`	`152`	`t.Fatalf("failed to read fixture %s: %v", path, err)`
`153`	`153`	`}`
`154`	`154`
`155`		`- yaml := injectMockerAnnotation(t, string(raw))`
`156`		`- out, err := kubectlApplyLiteral(t, yaml)`
	`155`	`+ manifest := injectMockerAnnotation(t, string(raw))`
	`156`	`+ out, err := kubectlApplyLiteral(t, manifest)`
`157`	`157`	`if err != nil {`
`158`	`158`	`t.Fatalf("failed to apply mocker ModelDeployment: %v\nOutput: %s", err, out)`
`159`	`159`	`}`