Skip to content

Commit f02b1c9

Browse files
Antriksh JainCopilot
andcommitted
fix(azure.ai.agents): doctor checks 5-6 — 3-of-3 review fix-ups (skip-cascade + production-equivalent YAML validation)
Two MEDIUM findings, 3-of-3 reviewer consensus across Opus xhigh, Sonnet 4.6, and GPT-5.5. Finding A (skip-cascade propagation): priorFailed only matched StatusFail, not StatusSkip. When local.azure-yaml failed, its dependents (env-selected, agent-service-detected) skipped — but THEIR dependents (project-endpoint-set, agent-yaml-valid) saw the upstream as Skip-not-Fail and ran anyway. Concrete user impact in the "wrong directory" scenario: 1 real failure (check 2: azure.yaml) + 1 misleading failure (check 5: "Run azd provision") + 1 duplicate failure (check 6: same error as check 2). Worse: with a stale .env, check 5 could PASS with a stale endpoint and hide the broken project. Fix: rename priorFailed -> priorBlocked and treat both Fail and Skip as blocking. Updated skip messages on checks 5 and 6 to say "failed or skipped" / "or upstream check blocked" so users know the root cause is earlier in the chain. Added regression tests for the Skip-state predecessor path on both checks; updated TestPriorBlocked to cover the new contract (including the "matching skip" case that previously asserted false). Finding B (YAML validation gap): GPT identified that check 6 used bare yaml.Unmarshal into ContainerAgent, which is decode-permissive — silently accepts manifests with missing kind, invalid kind (e.g. "nonsense"), missing name, or DNS-invalid name (e.g. "My_Agent"). Production deploy uses agent_yaml.ValidateAgentDefinition which rejects all of these. Opus additionally identified that the doctor's "gopkg.in/yaml.v3" import was the wrong library entirely — agent_yaml's custom UnmarshalYAML methods (PropertySchema.UnmarshalYAML at yaml.go:374) bind to *go.yaml.in/yaml/v3. Node, not gopkg.in/yaml.v3.Node. Go method dispatch is by exact parameter type, so the doctor was silently skipping every custom unmarshaler in agent_yaml. Production loads via go.yaml.in (helpers.go:28,772). Sonnet confirmed both at HIGH (doctor's whole purpose is to surface deploy blockers pre-flight) and recommended the bundled fix. 3/3 bundling consensus: replace the bare unmarshal with a single call to agent_yaml.ValidateAgentDefinition(data). Opus verified the library transitively resolves — parse.go imports go.yaml.in/yaml/v3, so once the gopkg.in/yaml.v3 call is removed the file's wrong-library import goes with it (Go compile error otherwise). One function body changed. This also fixes Opus's wording caveat: REPLACE, do not AUGMENT — a naive "call Validate THEN keep the existing Unmarshal" would have preserved the library mismatch. Test changes: - Existing valid-YAML fixtures gain "kind: hosted" (required field). - 3 new failure tests: MissingKind, InvalidKind, InvalidName — locking in that doctor surfaces the same errors deploy would surface. Pre-flight: gofmt, vet, build, doctor 6.7s + full extension suite 24s on cmd / 9.5s on agent_api / etc — all green. golangci-lint 0 issues. cspell 0 issues. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent ae4dd4b commit f02b1c9

2 files changed

Lines changed: 170 additions & 24 deletions

File tree

cli/azd/extensions/azure.ai.agents/internal/cmd/doctor/checks_project.go

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"azureaiagent/internal/pkg/agents/agent_yaml"
1515

1616
"github.com/azure/azure-dev/cli/azd/pkg/azdext"
17-
"gopkg.in/yaml.v3"
1817
)
1918

2019
// agentHost is the value used in azure.yaml for an azure.ai.agent service.
@@ -43,7 +42,7 @@ func newCheckAgentServiceDetected(deps Dependencies) Check {
4342
if deps.AzdClient == nil {
4443
return Result{Status: StatusSkip, Message: "skipped: azd extension not reachable"}
4544
}
46-
if priorFailed(prior, "local.azure-yaml") {
45+
if priorBlocked(prior, "local.azure-yaml") {
4746
return Result{Status: StatusSkip, Message: "skipped: azure.yaml check failed"}
4847
}
4948

@@ -115,8 +114,8 @@ func newCheckProjectEndpointSet(deps Dependencies) Check {
115114
if deps.AzdClient == nil {
116115
return Result{Status: StatusSkip, Message: "skipped: azd extension not reachable"}
117116
}
118-
if priorFailed(prior, "local.environment-selected") {
119-
return Result{Status: StatusSkip, Message: "skipped: environment check failed"}
117+
if priorBlocked(prior, "local.environment-selected") {
118+
return Result{Status: StatusSkip, Message: "skipped: environment check failed or skipped"}
120119
}
121120

122121
resp, err := deps.AzdClient.Environment().GetValue(ctx, &azdext.GetEnvRequest{
@@ -173,8 +172,8 @@ func newCheckAgentYAMLValid(deps Dependencies) Check {
173172
if deps.AzdClient == nil {
174173
return Result{Status: StatusSkip, Message: "skipped: azd extension not reachable"}
175174
}
176-
if priorFailed(prior, "local.agent-service-detected") {
177-
return Result{Status: StatusSkip, Message: "skipped: no agent services detected"}
175+
if priorBlocked(prior, "local.agent-service-detected") {
176+
return Result{Status: StatusSkip, Message: "skipped: no agent services detected or upstream check blocked"}
178177
}
179178

180179
resp, err := deps.AzdClient.Project().Get(ctx, &azdext.EmptyRequest{})
@@ -251,27 +250,32 @@ func newCheckAgentYAMLValid(deps Dependencies) Check {
251250
}
252251
}
253252

254-
// validateAgentYAML reads the file at path and ensures it parses as a
255-
// ContainerAgent. Returns the underlying read/parse error verbatim so
256-
// the caller can attribute it to the offending service.
253+
// validateAgentYAML reads the file at path and runs the same validation
254+
// (`agent_yaml.ValidateAgentDefinition`) that the deploy path uses, so a
255+
// PASS here implies the manifest will not be rejected by deploy for any
256+
// of: missing/invalid `kind`, missing/invalid `name`, or kind-specific
257+
// structural problems. Returns the underlying read/validate error
258+
// verbatim so the caller can attribute it to the offending service.
257259
func validateAgentYAML(path string) error {
258260
data, err := os.ReadFile(path) //nolint:gosec // G304: path is constructed from azd-resolved project root + service-relative path
259261
if err != nil {
260262
return fmt.Errorf("read %s: %w", path, err)
261263
}
262-
var parsed agent_yaml.ContainerAgent
263-
if err := yaml.Unmarshal(data, &parsed); err != nil {
264-
return fmt.Errorf("parse %s: %w", path, err)
264+
if err := agent_yaml.ValidateAgentDefinition(data); err != nil {
265+
return fmt.Errorf("validate %s: %w", path, err)
265266
}
266267
return nil
267268
}
268269

269-
// priorFailed reports whether the prior results contain a Fail entry
270-
// for the given check ID. Used for skip-cascade decisions across the
271-
// local-checks chain.
272-
func priorFailed(prior []Result, id string) bool {
270+
// priorBlocked reports whether the prior results contain a Fail or Skip
271+
// entry for the given check ID. Used for skip-cascade decisions across
272+
// the local-checks chain: when an upstream check is skipped (e.g.
273+
// because *its* upstream failed), downstream checks must also skip
274+
// rather than running on a broken-state assumption — otherwise users
275+
// see misleading remediation for the wrong root cause.
276+
func priorBlocked(prior []Result, id string) bool {
273277
for _, p := range prior {
274-
if p.ID == id && p.Status == StatusFail {
278+
if p.ID == id && (p.Status == StatusFail || p.Status == StatusSkip) {
275279
return true
276280
}
277281
}

cli/azd/extensions/azure.ai.agents/internal/cmd/doctor/checks_project_test.go

Lines changed: 149 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,28 @@ func TestCheckProjectEndpointSet_PriorEnvFailed_Skips(t *testing.T) {
179179
require.Contains(t, got.Message, "environment check failed")
180180
}
181181

182+
func TestCheckProjectEndpointSet_PriorEnvSkipped_AlsoSkips(t *testing.T) {
183+
// Covers the cascade: azure-yaml fails -> environment-selected skips ->
184+
// project-endpoint-set must also skip. Without this propagation, check 5
185+
// would run against an unloaded env and surface misleading remediation
186+
// for the wrong root cause.
187+
t.Parallel()
188+
189+
client := newTestAzdClient(t,
190+
&fakeProjectServer{},
191+
// If the check incorrectly proceeds past the guard it would call
192+
// GetValue; set valueErr so we'd see the wrong-path Fail in the
193+
// assertion instead of a quiet Skip.
194+
&fakeEnvironmentServer{valueErr: errors.New("should not be called")})
195+
check := newCheckProjectEndpointSet(Dependencies{AzdClient: client})
196+
197+
prior := []Result{{ID: "local.environment-selected", Status: StatusSkip}}
198+
got := check.Fn(t.Context(), Options{}, prior)
199+
200+
require.Equal(t, StatusSkip, got.Status)
201+
require.Contains(t, got.Message, "environment check failed or skipped")
202+
}
203+
182204
func TestCheckProjectEndpointSet_GRPCError_Fails(t *testing.T) {
183205
t.Parallel()
184206

@@ -287,6 +309,26 @@ func TestCheckAgentYAMLValid_PriorAgentDetectionFailed_Skips(t *testing.T) {
287309
require.Contains(t, got.Message, "no agent services detected")
288310
}
289311

312+
func TestCheckAgentYAMLValid_PriorAgentDetectionSkipped_AlsoSkips(t *testing.T) {
313+
// Covers the cascade: azure-yaml fails -> agent-service-detected skips ->
314+
// agent-yaml-valid must also skip. Without this propagation, check 6
315+
// would re-fetch the project (failing identically to check 2) and
316+
// surface a duplicate failure for the same root cause.
317+
t.Parallel()
318+
319+
client := newTestAzdClient(t,
320+
// Server set up to fail if reached, to ensure the guard short-circuits.
321+
&fakeProjectServer{err: errors.New("should not be called")},
322+
&fakeEnvironmentServer{})
323+
check := newCheckAgentYAMLValid(Dependencies{AzdClient: client})
324+
325+
prior := []Result{{ID: "local.agent-service-detected", Status: StatusSkip}}
326+
got := check.Fn(t.Context(), Options{}, prior)
327+
328+
require.Equal(t, StatusSkip, got.Status)
329+
require.Contains(t, got.Message, "no agent services detected or upstream check blocked")
330+
}
331+
290332
func TestCheckAgentYAMLValid_GRPCError_Fails(t *testing.T) {
291333
t.Parallel()
292334

@@ -321,6 +363,7 @@ func TestCheckAgentYAMLValid_OneServiceValid_Passes(t *testing.T) {
321363
projectPath := t.TempDir()
322364
require.NoError(t, os.MkdirAll(filepath.Join(projectPath, "src", "agent"), 0o750))
323365
writeYAML(t, projectPath, "src/agent/agent.yaml", `
366+
kind: hosted
324367
name: echo-agent
325368
language: python
326369
entrypoint: main.py
@@ -352,7 +395,7 @@ func TestCheckAgentYAMLValid_NonAgentServicesIgnored(t *testing.T) {
352395

353396
projectPath := t.TempDir()
354397
require.NoError(t, os.MkdirAll(filepath.Join(projectPath, "src", "agent"), 0o750))
355-
writeYAML(t, projectPath, "src/agent/agent.yaml", "name: echo\nlanguage: python\n")
398+
writeYAML(t, projectPath, "src/agent/agent.yaml", "kind: hosted\nname: echo\nlanguage: python\n")
356399

357400
client := newTestAzdClient(t,
358401
&fakeProjectServer{resp: &azdext.GetProjectResponse{
@@ -438,7 +481,7 @@ func TestCheckAgentYAMLValid_MixedValidAndInvalid_Fails(t *testing.T) {
438481
projectPath := t.TempDir()
439482
require.NoError(t, os.MkdirAll(filepath.Join(projectPath, "src", "ok"), 0o750))
440483
require.NoError(t, os.MkdirAll(filepath.Join(projectPath, "src", "bad"), 0o750))
441-
writeYAML(t, projectPath, "src/ok/agent.yaml", "name: ok-agent\nlanguage: python\n")
484+
writeYAML(t, projectPath, "src/ok/agent.yaml", "kind: hosted\nname: ok-agent\nlanguage: python\n")
442485
// bad: malformed yaml (mapping key with no value, broken indent).
443486
writeYAML(t, projectPath, "src/bad/agent.yaml", "name: bad\n : nope\n\t- tabs-here\n")
444487

@@ -471,9 +514,9 @@ func TestCheckAgentYAMLValid_MixedValidAndInvalid_Fails(t *testing.T) {
471514
require.Len(t, validated, 1)
472515
}
473516

474-
// ---- helper: priorFailed ----
517+
// ---- helper: priorBlocked ----
475518

476-
func TestPriorFailed(t *testing.T) {
519+
func TestPriorBlocked(t *testing.T) {
477520
t.Parallel()
478521

479522
cases := []struct {
@@ -485,22 +528,121 @@ func TestPriorFailed(t *testing.T) {
485528
{"empty prior", nil, "x", false},
486529
{"matching fail", []Result{{ID: "x", Status: StatusFail}}, "x", true},
487530
{"matching pass", []Result{{ID: "x", Status: StatusPass}}, "x", false},
488-
{"matching skip", []Result{{ID: "x", Status: StatusSkip}}, "x", false},
531+
// Skip propagates blocking: if upstream skipped (because *its* upstream failed),
532+
// downstream checks must also skip rather than run on broken assumptions.
533+
{"matching skip", []Result{{ID: "x", Status: StatusSkip}}, "x", true},
489534
{"matching warn", []Result{{ID: "x", Status: StatusWarn}}, "x", false},
490535
{"different id fail", []Result{{ID: "y", Status: StatusFail}}, "x", false},
491-
{"id matches middle entry", []Result{
536+
{"different id skip", []Result{{ID: "y", Status: StatusSkip}}, "x", false},
537+
{"id matches middle entry fail", []Result{
492538
{ID: "a", Status: StatusPass},
493539
{ID: "x", Status: StatusFail},
494540
{ID: "c", Status: StatusPass},
495541
}, "x", true},
542+
{"id matches middle entry skip", []Result{
543+
{ID: "a", Status: StatusPass},
544+
{ID: "x", Status: StatusSkip},
545+
{ID: "c", Status: StatusPass},
546+
}, "x", true},
496547
}
497548
for _, tc := range cases {
498549
t.Run(tc.name, func(t *testing.T) {
499-
require.Equal(t, tc.want, priorFailed(tc.prior, tc.id))
550+
require.Equal(t, tc.want, priorBlocked(tc.prior, tc.id))
500551
})
501552
}
502553
}
503554

555+
func TestCheckAgentYAMLValid_MissingKind_Fails(t *testing.T) {
556+
// Without explicit `kind:`, ValidateAgentDefinition rejects the manifest
557+
// because kind is required. Doctor must catch this pre-flight rather
558+
// than letting deploy be the first place that surfaces it.
559+
t.Parallel()
560+
561+
projectPath := t.TempDir()
562+
writeYAML(t, projectPath, "src/agent/agent.yaml", "name: echo-agent\nlanguage: python\n")
563+
564+
client := newTestAzdClient(t,
565+
&fakeProjectServer{resp: &azdext.GetProjectResponse{
566+
Project: &azdext.ProjectConfig{
567+
Path: projectPath,
568+
Services: map[string]*azdext.ServiceConfig{
569+
"echo-agent": {Name: "echo-agent", Host: agentHost, RelativePath: "src/agent"},
570+
},
571+
},
572+
}},
573+
&fakeEnvironmentServer{})
574+
check := newCheckAgentYAMLValid(Dependencies{AzdClient: client})
575+
576+
got := check.Fn(t.Context(), Options{}, nil)
577+
578+
require.Equal(t, StatusFail, got.Status)
579+
require.Contains(t, got.Message, "echo-agent")
580+
failures, ok := got.Details["failures"].([]string)
581+
require.True(t, ok)
582+
require.Len(t, failures, 1)
583+
require.Contains(t, failures[0], "kind")
584+
}
585+
586+
func TestCheckAgentYAMLValid_InvalidKind_Fails(t *testing.T) {
587+
// A `kind` that isn't in ValidAgentKinds() (hosted/workflow) must be
588+
// rejected. Bare yaml.Unmarshal would silently accept this; the
589+
// production deploy path rejects it via ValidateAgentDefinition.
590+
t.Parallel()
591+
592+
projectPath := t.TempDir()
593+
writeYAML(t, projectPath, "src/agent/agent.yaml", "kind: nonsense\nname: echo-agent\nlanguage: python\n")
594+
595+
client := newTestAzdClient(t,
596+
&fakeProjectServer{resp: &azdext.GetProjectResponse{
597+
Project: &azdext.ProjectConfig{
598+
Path: projectPath,
599+
Services: map[string]*azdext.ServiceConfig{
600+
"echo-agent": {Name: "echo-agent", Host: agentHost, RelativePath: "src/agent"},
601+
},
602+
},
603+
}},
604+
&fakeEnvironmentServer{})
605+
check := newCheckAgentYAMLValid(Dependencies{AzdClient: client})
606+
607+
got := check.Fn(t.Context(), Options{}, nil)
608+
609+
require.Equal(t, StatusFail, got.Status)
610+
failures, ok := got.Details["failures"].([]string)
611+
require.True(t, ok)
612+
require.Len(t, failures, 1)
613+
require.Contains(t, failures[0], "kind")
614+
}
615+
616+
func TestCheckAgentYAMLValid_InvalidName_Fails(t *testing.T) {
617+
// Agent name must match `^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$`
618+
// (DNS-style). An underscore is invalid for deployable agent names.
619+
// Doctor must surface this before deploy, not after.
620+
t.Parallel()
621+
622+
projectPath := t.TempDir()
623+
writeYAML(t, projectPath, "src/agent/agent.yaml", "kind: hosted\nname: My_Agent\nlanguage: python\n")
624+
625+
client := newTestAzdClient(t,
626+
&fakeProjectServer{resp: &azdext.GetProjectResponse{
627+
Project: &azdext.ProjectConfig{
628+
Path: projectPath,
629+
Services: map[string]*azdext.ServiceConfig{
630+
"my-agent": {Name: "my-agent", Host: agentHost, RelativePath: "src/agent"},
631+
},
632+
},
633+
}},
634+
&fakeEnvironmentServer{})
635+
check := newCheckAgentYAMLValid(Dependencies{AzdClient: client})
636+
637+
got := check.Fn(t.Context(), Options{}, nil)
638+
639+
require.Equal(t, StatusFail, got.Status)
640+
failures, ok := got.Details["failures"].([]string)
641+
require.True(t, ok)
642+
require.Len(t, failures, 1)
643+
require.Contains(t, failures[0], "name")
644+
}
645+
504646
// writeYAML is a tiny test helper that writes the given content to
505647
// <root>/<rel> after ensuring the parent directory exists.
506648
func writeYAML(t *testing.T, root, rel, content string) {

0 commit comments

Comments
 (0)