Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 112 additions & 2 deletions eng/skill-validator/src/Commands/ValidateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ public static RootCommand Create()
var reporterOpt = new Option<string[]>("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true };
var noOverfittingCheckOpt = new Option<bool>("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" };
var overfittingFixOpt = new Option<bool>("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" };
var selectivityTestOpt = new Option<bool>("--selectivity-test") { Description = "Run selectivity test using should_activate / should_not_activate prompts from eval.yaml" };
var selectivityMinRecallOpt = new Option<double>("--selectivity-min-recall") { Description = "Minimum recall (activation on should_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 };
var selectivityMinPrecisionOpt = new Option<double>("--selectivity-min-precision") { Description = "Minimum precision (non-activation on should_not_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 };

var command = new RootCommand("Validate that agent skills meaningfully improve agent performance")
{
Expand All @@ -53,6 +56,9 @@ public static RootCommand Create()
reporterOpt,
noOverfittingCheckOpt,
overfittingFixOpt,
selectivityTestOpt,
selectivityMinRecallOpt,
selectivityMinPrecisionOpt,
};

command.SetAction(async (parseResult, _) =>
Expand Down Expand Up @@ -98,6 +104,9 @@ public static RootCommand Create()
TestsDir = parseResult.GetValue(testsDirOpt),
OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt),
OverfittingFix = parseResult.GetValue(overfittingFixOpt),
SelectivityTest = parseResult.GetValue(selectivityTestOpt),
SelectivityMinRecall = parseResult.GetValue(selectivityMinRecallOpt),
SelectivityMinPrecision = parseResult.GetValue(selectivityMinPrecisionOpt),
};

return await Run(config);
Expand Down Expand Up @@ -333,6 +342,36 @@ internal static List<string> CheckAggregateDescriptionLimits(IReadOnlyList<Skill
};
}

// Selectivity-only mode: skip full evaluation, just probe skill activation
if (config.SelectivityTest)
{
if (skill.EvalConfig is not null
&& (skill.EvalConfig.ShouldActivatePrompts is { Count: > 0 } || skill.EvalConfig.ShouldNotActivatePrompts is { Count: > 0 }))
{
log("🎯 Running selectivity test (standalone)...");
var selectivityResult = await ExecuteSelectivityTest(skill, config, spinner);
log($"🎯 Selectivity: recall={selectivityResult.Recall:P0}, precision={selectivityResult.Precision:P0} β€” {(selectivityResult.Passed ? "PASSED" : "FAILED")}");

return new SkillVerdict
{
SkillName = skill.Name,
SkillPath = skill.Path,
Passed = selectivityResult.Passed,
Scenarios = [],
OverallImprovementScore = 0,
Reason = selectivityResult.Passed
? "Selectivity test passed"
: $"Selectivity test failed: {selectivityResult.Reason}",
FailureKind = selectivityResult.Passed ? null : "selectivity_failure",
ProfileWarnings = profile.Warnings,
SelectivityResult = selectivityResult,
};
}

log("⏭ Skipping (no selectivity prompts in eval.yaml)");
return null;
}
Comment on lines +427 to +455
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In --selectivity-test mode this code runs only after the earlier if (skill.EvalConfig.Scenarios.Count == 0) ... return null; guard, so selectivity testing is currently impossible for a skill that provides only selectivity prompts (or has an empty scenarios: list). If selectivity-only eval.yaml files are intended, move the scenario-count skip below the selectivity branch (or only enforce scenarios when not in selectivity mode).

Copilot uses AI. Check for mistakes.

// Launch overfitting check in parallel with scenario execution
var workDir = Path.GetTempPath();
Task<OverfittingResult?> overfittingTask = Task.FromResult<OverfittingResult?>(null);
Expand Down Expand Up @@ -496,8 +535,8 @@ private static async Task<RunExecutionResult> ExecuteRun(
runLog("running agents...");

var agentTasks = await Task.WhenAll(
AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)),
AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog)));
AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, Log: runLog)),
AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, Log: runLog)));
var baselineMetrics = agentTasks[0];
var withSkillMetrics = agentTasks[1];

Expand Down Expand Up @@ -642,4 +681,75 @@ private static string SanitizeErrorMessage(string? message)
var singleLine = raw.ReplaceLineEndings(" ");
return singleLine.Length > 150 ? singleLine[..150] + "…" : singleLine;
}

/// <summary>
/// Runs the selectivity test: probes the skill with each should_activate /
/// should_not_activate prompt from eval.yaml and scores recall and precision
/// against the configured thresholds.
/// </summary>
/// <param name="skill">Skill under test; its EvalConfig supplies the prompt lists.</param>
/// <param name="config">Validator configuration (thresholds, model, verbosity).</param>
/// <param name="spinner">Console spinner used for progress logging.</param>
/// <returns>A <see cref="SelectivityResult"/> with per-prompt outcomes and a pass/fail verdict.</returns>
private static async Task<SelectivityResult> ExecuteSelectivityTest(SkillInfo skill, ValidatorConfig config, Spinner spinner)
{
    var prefix = $"[{skill.Name}/selectivity]";
    var log = (string msg) => spinner.Log($"{prefix} {msg}");

    // Cap concurrent probes: launching one agent session per prompt all at once
    // can burst the inference API and trigger throttling/rejections.
    using var gate = new SemaphoreSlim(4);

    // Runs a single probe under the concurrency gate.
    async Task<SelectivityPromptResult> GatedProbe(string prompt, bool expectedActivation)
    {
        await gate.WaitAsync();
        try
        {
            return await ProbeAndLog(skill, prompt, expectedActivation, config, log);
        }
        finally
        {
            gate.Release();
        }
    }

    var tasks = new List<Task<SelectivityPromptResult>>();

    if (skill.EvalConfig!.ShouldActivatePrompts is { } activatePrompts)
    {
        foreach (var prompt in activatePrompts)
        {
            log($"Testing should_activate: \"{Truncate(prompt, 60)}\"");
            tasks.Add(GatedProbe(prompt, expectedActivation: true));
        }
    }

    if (skill.EvalConfig.ShouldNotActivatePrompts is { } deactivatePrompts)
    {
        foreach (var prompt in deactivatePrompts)
        {
            log($"Testing should_not_activate: \"{Truncate(prompt, 60)}\"");
            tasks.Add(GatedProbe(prompt, expectedActivation: false));
        }
    }

    var promptResults = (await Task.WhenAll(tasks)).ToList();

    // Recall: fraction of should_activate prompts that actually activated (TP / (TP + FN)).
    var shouldActivateResults = promptResults.Where(r => r.ExpectedActivation).ToList();
    double recall = shouldActivateResults.Count > 0
        ? (double)shouldActivateResults.Count(r => r.SkillActivated) / shouldActivateResults.Count
        : 1.0;

    // Precision: fraction of observed activations that were expected (TP / (TP + FP)).
    // Note: an earlier revision computed the true-negative rate (specificity) here,
    // which did not match the metric name exposed via --selectivity-min-precision.
    var shouldNotActivateResults = promptResults.Where(r => !r.ExpectedActivation).ToList();
    var truePositives = shouldActivateResults.Count(r => r.SkillActivated);
    var falsePositives = shouldNotActivateResults.Count(r => r.SkillActivated);
    double precision = (truePositives + falsePositives) > 0
        ? (double)truePositives / (truePositives + falsePositives)
        : 1.0;

    bool passed = recall >= config.SelectivityMinRecall && precision >= config.SelectivityMinPrecision;
    var reasons = new List<string>();
    if (recall < config.SelectivityMinRecall)
        reasons.Add($"Recall {recall:P0} below threshold {config.SelectivityMinRecall:P0}");
    if (precision < config.SelectivityMinPrecision)
        reasons.Add($"Precision {precision:P0} below threshold {config.SelectivityMinPrecision:P0}");
    string reason = passed ? "Selectivity test passed" : string.Join("; ", reasons);

    return new SelectivityResult(promptResults, recall, precision, passed, reason);
}

/// <summary>
/// Probes whether the skill activates for a single prompt, logs the outcome
/// relative to the expectation, and packages the result.
/// </summary>
private static async Task<SelectivityPromptResult> ProbeAndLog(
    SkillInfo skill, string prompt, bool expectedActivation, ValidatorConfig config, Action<string> log)
{
    bool activated = await TestSkillActivation(skill, prompt, config);

    // Pick the outcome label from the (expected, actual) combination.
    string outcome = (expectedActivation, activated) switch
    {
        (true, true) => "✅ activated",
        (true, false) => "❌ NOT activated",
        (false, true) => "❌ activated (unexpected)",
        (false, false) => "✅ correctly NOT activated",
    };
    log($" → {outcome}: \"{Truncate(prompt, 50)}\"");

    return new SelectivityPromptResult(prompt, ExpectedActivation: expectedActivation, SkillActivated: activated);
}

/// <summary>
/// Sends a single prompt via a minimal throwaway scenario and reports whether
/// the skill was invoked; the rubric is irrelevant for activation probing.
/// </summary>
private static async Task<bool> TestSkillActivation(SkillInfo skill, string prompt, ValidatorConfig config)
{
    var probeScenario = new EvalScenario(Name: "selectivity-probe", Prompt: prompt, Rubric: [], Timeout: 15);
    var runOptions = new RunOptions(probeScenario, skill, skill.EvalPath, config.Model, config.Verbose);
    return await AgentRunner.ProbeSkillActivation(runOptions);
}

/// <summary>
/// Shortens <paramref name="value"/> to at most <paramref name="maxLength"/>
/// characters, replacing the tail with a single ellipsis when it is too long.
/// </summary>
private static string Truncate(string value, int maxLength)
{
    if (value.Length <= maxLength)
        return value;
    // Keep maxLength-1 characters so the ellipsis fits within the limit.
    return value[..(maxLength - 1)] + "…";
}
}
23 changes: 22 additions & 1 deletion eng/skill-validator/src/Models/Models.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ public sealed record EvalScenario(
int? MaxTokens = null,
bool ExpectActivation = true);

public sealed record EvalConfig(IReadOnlyList<EvalScenario> Scenarios);
/// <summary>
/// Parsed eval.yaml contents for a skill: the evaluation scenarios plus optional
/// selectivity prompt lists (YAML: selectivity.should_activate / should_not_activate).
/// </summary>
/// <param name="Scenarios">Evaluation scenarios; the parser rejects configs without at least one.</param>
/// <param name="ShouldActivatePrompts">Prompts expected to trigger the skill, or null when not configured.</param>
/// <param name="ShouldNotActivatePrompts">Prompts expected to NOT trigger the skill, or null when not configured.</param>
public sealed record EvalConfig(
    IReadOnlyList<EvalScenario> Scenarios,
    IReadOnlyList<string>? ShouldActivatePrompts = null,
    IReadOnlyList<string>? ShouldNotActivatePrompts = null);

// --- Skill info ---

Expand Down Expand Up @@ -227,6 +230,7 @@ public sealed class SkillVerdict
public IReadOnlyList<string>? ProfileWarnings { get; set; }
public bool SkillNotActivated { get; set; }
public OverfittingResult? OverfittingResult { get; set; }
public SelectivityResult? SelectivityResult { get; set; }
}

// --- Overfitting assessment ---
Expand Down Expand Up @@ -274,6 +278,20 @@ public sealed record OverfittingJudgeOptions(
int Timeout,
string WorkDir);

// --- Selectivity test ---

/// <summary>Outcome of one selectivity probe for a single prompt.</summary>
/// <param name="Prompt">The prompt that was sent to the agent.</param>
/// <param name="ExpectedActivation">True when the prompt came from should_activate; false for should_not_activate.</param>
/// <param name="SkillActivated">Whether the skill was actually invoked during the probe.</param>
public sealed record SelectivityPromptResult(
    string Prompt,
    bool ExpectedActivation,
    bool SkillActivated);

/// <summary>Aggregate result of a skill's selectivity test.</summary>
/// <param name="PromptResults">Per-prompt probe outcomes.</param>
/// <param name="Recall">Fraction of should_activate prompts that activated the skill (0-1).</param>
/// <param name="Precision">Precision metric compared against --selectivity-min-precision; see ExecuteSelectivityTest for the exact formula.</param>
/// <param name="Passed">True when both metrics met their configured thresholds.</param>
/// <param name="Reason">Human-readable pass/fail explanation for reporting.</param>
public sealed record SelectivityResult(
    IReadOnlyList<SelectivityPromptResult> PromptResults,
    double Recall,
    double Precision,
    bool Passed,
    string Reason);

// --- Config ---

public sealed record ReporterSpec(ReporterType Type);
Expand Down Expand Up @@ -308,6 +326,9 @@ public sealed record ValidatorConfig
public string? TestsDir { get; init; }
public bool OverfittingCheck { get; init; } = true;
public bool OverfittingFix { get; init; }
public bool SelectivityTest { get; init; }
public double SelectivityMinRecall { get; init; } = 0.8;
public double SelectivityMinPrecision { get; init; } = 0.8;
}

public static class DefaultWeights
Expand Down
71 changes: 71 additions & 0 deletions eng/skill-validator/src/Services/AgentRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,77 @@ public static async Task<RunMetrics> RunAgent(RunOptions options)
return metrics;
}

/// <summary>
/// Lightweight probe that sends a prompt and checks whether the skill is activated.
/// Exits immediately when a SkillInvokedEvent is seen, or waits for the session to
/// complete/timeout. Designed to run many probes in parallel via Task.WhenAll.
/// </summary>
/// <summary>
/// Lightweight probe that sends a prompt and checks whether the skill is activated.
/// Exits immediately when a SkillInvokedEvent is seen, or waits for the session to
/// complete/timeout. Designed to run many probes in parallel via Task.WhenAll.
/// Infrastructure failures are treated as best-effort "not activated" but are logged.
/// </summary>
/// <param name="options">Run options; Scenario supplies the prompt and timeout.</param>
/// <returns>True when a SkillInvokedEvent was observed before the session went idle or timed out.</returns>
public static async Task<bool> ProbeSkillActivation(RunOptions options)
{
    var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
    Directory.CreateDirectory(workDir);
    _workDirs.Add(workDir);

    if (options.Verbose)
    {
        var write = options.Log ?? (msg => Console.Error.WriteLine(msg));
        write($" 📂 {workDir} (skilled)");
    }

    bool skillActivated = false;
    // RunContinuationsAsynchronously avoids running continuations inline on the
    // event-callback thread that completes the TCS.
    var done = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

    try
    {
        var client = await GetSharedClient(options.Verbose);
        await using var session = await client.CreateSessionAsync(
            BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers));

        // Honor the scenario's configured timeout so probe duration stays consistent
        // with the eval configuration; fall back to 30s when none/non-positive is set.
        // (Assumes Scenario.Timeout is in seconds, matching the probe caller's Timeout: 15.)
        var timeoutSeconds = options.Scenario.Timeout > 0 ? options.Scenario.Timeout : 30;
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
        cts.Token.Register(() => done.TrySetResult(skillActivated));

        session.On(evt =>
        {
            switch (evt)
            {
                // Skill loaded → we have our answer, bail immediately
                case SkillInvokedEvent:
                    skillActivated = true;
                    done.TrySetResult(true);
                    break;

                // Session finished without loading the skill → not activated
                case SessionIdleEvent:
                    done.TrySetResult(skillActivated);
                    break;

                case SessionErrorEvent err:
                    done.TrySetException(new InvalidOperationException(err.Data.Message ?? "Session error"));
                    break;
            }

            if (options.Verbose)
            {
                var write = options.Log ?? (m => Console.Error.WriteLine(m));
                if (evt is SkillInvokedEvent si)
                    write($" 📘 Skill invoked: {si.Data.Name}");
                else if (evt is ToolExecutionStartEvent ts)
                    write($" 🔧 {ts.Data.ToolName}");
            }
        });

        await session.SendAsync(new MessageOptions { Prompt = options.Scenario.Prompt });
        return await done.Task;
    }
    catch (Exception ex)
    {
        // Best-effort: treat probe failures (session creation, send, session error)
        // as "not activated" rather than failing the whole selectivity run — but
        // surface the error so infrastructure problems aren't silently hidden.
        var write = options.Log ?? (m => Console.Error.WriteLine(m));
        write($" ⚠️ selectivity probe failed: {ex.Message}");
        return skillActivated;
    }
}

private static async Task<string> SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath)
{
var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
Expand Down
11 changes: 10 additions & 1 deletion eng/skill-validator/src/Services/EvalSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public static EvalConfig ParseEvalConfig(string yamlContent)
if (scenarios is not { Count: > 0 })
throw new InvalidOperationException("Eval config must have at least one scenario");

return new EvalConfig(scenarios);
return new EvalConfig(scenarios, raw.Selectivity?.ShouldActivate, raw.Selectivity?.ShouldNotActivate);
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No tests currently cover parsing/round-tripping the new selectivity.should_activate / selectivity.should_not_activate fields. Adding a unit test in EvalSchemaTests that asserts these lists are parsed into EvalConfig (and validates expected behavior when scenarios are empty/missing, if supported) would help prevent regressions.

Copilot uses AI. Check for mistakes.
Comment on lines 19 to +24
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ParseEvalConfig still throws when scenarios is missing/empty, which prevents using --selectivity-test with an eval.yaml that only defines selectivity.should_activate/should_not_activate (as described in the PR). Consider allowing zero scenarios when selectivity prompts are present (or relaxing this only when running selectivity mode) so the lightweight probe can run without requiring full scenarios.

Copilot uses AI. Check for mistakes.
}

public static (bool Success, EvalConfig? Data, IReadOnlyList<string>? Errors) ValidateEvalConfig(string yamlContent)
Expand Down Expand Up @@ -122,6 +122,15 @@ internal sealed class RawFrontmatter
/// <summary>Root YAML shape of eval.yaml as deserialized, prior to validation.</summary>
internal sealed class RawEvalConfig
{
    // Scenario list; ParseEvalConfig rejects configs where this is null or empty.
    public List<RawScenario>? Scenarios { get; set; }
    // Optional selectivity block (should_activate / should_not_activate prompt lists).
    public RawSelectivity? Selectivity { get; set; }
}

/// <summary>YAML shape of the optional "selectivity" block in eval.yaml.</summary>
internal sealed class RawSelectivity
{
    // Prompts that are expected to cause the skill to activate.
    [YamlMember(Alias = "should_activate")]
    public List<string>? ShouldActivate { get; set; }
    // Prompts that are expected to NOT cause the skill to activate.
    [YamlMember(Alias = "should_not_activate")]
    public List<string>? ShouldNotActivate { get; set; }
}

internal sealed class RawScenario
Expand Down
51 changes: 39 additions & 12 deletions eng/skill-validator/src/Services/Reporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,22 +62,34 @@ private static void ReportConsole(IReadOnlyList<SkillVerdict> verdicts, bool ver
{
var icon = verdict.Passed ? "\x1b[32mβœ“\x1b[0m" : "\x1b[31mβœ—\x1b[0m";
var name = $"\x1b[1m{verdict.SkillName}\x1b[0m";
var score = FormatScore(verdict.OverallImprovementScore);

var scoreLine = $"{icon} {name} {score}";
if (verdict.ConfidenceInterval is { } ci)
// Selectivity-only verdict: no scenarios or score to display
bool isSelectivityOnly = verdict.Scenarios.Count == 0 && verdict.SelectivityResult is not null;

if (isSelectivityOnly)
{
var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]";
var sigStr = verdict.IsSignificant == true
? "\x1b[32msignificant\x1b[0m"
: "\x1b[33mnot significant\x1b[0m";
scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}";
Console.WriteLine($"{icon} {name} \x1b[2m(selectivity only)\x1b[0m");
Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m");
}
if (verdict.NormalizedGain is { } ng)
scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m";
else
{
var score = FormatScore(verdict.OverallImprovementScore);

var scoreLine = $"{icon} {name} {score}";
if (verdict.ConfidenceInterval is { } ci)
{
var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]";
var sigStr = verdict.IsSignificant == true
? "\x1b[32msignificant\x1b[0m"
: "\x1b[33mnot significant\x1b[0m";
scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}";
}
if (verdict.NormalizedGain is { } ng)
scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m";

Console.WriteLine(scoreLine);
Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m");
Console.WriteLine(scoreLine);
Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m");
}

if (!verdict.Passed && verdict.ProfileWarnings is { Count: > 0 })
{
Expand Down Expand Up @@ -132,6 +144,21 @@ private static void ReportConsole(IReadOnlyList<SkillVerdict> verdicts, bool ver
Console.WriteLine($" \x1b[2mβ€’\x1b[0m [{item.Classification}] \x1b[2m{item.AssertionSummary}\x1b[0m\n \x1b[2mβ€” {item.Reasoning}\x1b[0m");
}
}
if (verdict.SelectivityResult is { } selResult)
{
Console.WriteLine();
var selIcon = selResult.Passed ? "βœ…" : "πŸ”΄";
Console.WriteLine($" 🎯 Selectivity: recall={selResult.Recall:P0}, precision={selResult.Precision:P0} {selIcon}");
foreach (var pr in selResult.PromptResults)
{
var expected = pr.ExpectedActivation ? "should activate" : "should NOT activate";
var correct = (pr.ExpectedActivation == pr.SkillActivated);
var prIcon = correct ? "\x1b[32mβœ“\x1b[0m" : "\x1b[31mβœ—\x1b[0m";
var activatedStr = pr.SkillActivated ? "activated" : "not activated";
var prompt = pr.Prompt.Length > 60 ? pr.Prompt[..59] + "…" : pr.Prompt;
Console.WriteLine($" {prIcon} \x1b[2m\"{prompt}\" β€” {expected} β†’ {activatedStr}\x1b[0m");
}
}
if (verdict.Scenarios.Count > 0)
{
Console.WriteLine();
Expand Down
Loading
Loading