-
Notifications
You must be signed in to change notification settings - Fork 96
feat: skill loading tests #311
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -30,6 +30,9 @@ public static RootCommand Create() | |||||||||||||||||||||
| var reporterOpt = new Option<string[]>("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true }; | ||||||||||||||||||||||
| var noOverfittingCheckOpt = new Option<bool>("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" }; | ||||||||||||||||||||||
| var overfittingFixOpt = new Option<bool>("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" }; | ||||||||||||||||||||||
| var selectivityTestOpt = new Option<bool>("--selectivity-test") { Description = "Run selectivity test using should_activate / should_not_activate prompts from eval.yaml" }; | ||||||||||||||||||||||
| var selectivityMinRecallOpt = new Option<double>("--selectivity-min-recall") { Description = "Minimum recall (activation on should_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 }; | ||||||||||||||||||||||
| var selectivityMinPrecisionOpt = new Option<double>("--selectivity-min-precision") { Description = "Minimum precision (non-activation on should_not_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 }; | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
|
|
@@ -53,6 +56,9 @@ public static RootCommand Create() | |||||||||||||||||||||
| reporterOpt, | ||||||||||||||||||||||
| noOverfittingCheckOpt, | ||||||||||||||||||||||
| overfittingFixOpt, | ||||||||||||||||||||||
| selectivityTestOpt, | ||||||||||||||||||||||
| selectivityMinRecallOpt, | ||||||||||||||||||||||
| selectivityMinPrecisionOpt, | ||||||||||||||||||||||
| }; | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| command.SetAction(async (parseResult, _) => | ||||||||||||||||||||||
|
|
@@ -98,6 +104,9 @@ public static RootCommand Create() | |||||||||||||||||||||
| TestsDir = parseResult.GetValue(testsDirOpt), | ||||||||||||||||||||||
| OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt), | ||||||||||||||||||||||
| OverfittingFix = parseResult.GetValue(overfittingFixOpt), | ||||||||||||||||||||||
| SelectivityTest = parseResult.GetValue(selectivityTestOpt), | ||||||||||||||||||||||
| SelectivityMinRecall = parseResult.GetValue(selectivityMinRecallOpt), | ||||||||||||||||||||||
| SelectivityMinPrecision = parseResult.GetValue(selectivityMinPrecisionOpt), | ||||||||||||||||||||||
| }; | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| return await Run(config); | ||||||||||||||||||||||
|
|
@@ -333,6 +342,36 @@ internal static List<string> CheckAggregateDescriptionLimits(IReadOnlyList<Skill | |||||||||||||||||||||
| }; | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // Selectivity-only mode: skip full evaluation, just probe skill activation | ||||||||||||||||||||||
| if (config.SelectivityTest) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| if (skill.EvalConfig is not null | ||||||||||||||||||||||
| && (skill.EvalConfig.ShouldActivatePrompts is { Count: > 0 } || skill.EvalConfig.ShouldNotActivatePrompts is { Count: > 0 })) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| log("π― Running selectivity test (standalone)..."); | ||||||||||||||||||||||
| var selectivityResult = await ExecuteSelectivityTest(skill, config, spinner); | ||||||||||||||||||||||
| log($"π― Selectivity: recall={selectivityResult.Recall:P0}, precision={selectivityResult.Precision:P0} β {(selectivityResult.Passed ? "PASSED" : "FAILED")}"); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| return new SkillVerdict | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| SkillName = skill.Name, | ||||||||||||||||||||||
| SkillPath = skill.Path, | ||||||||||||||||||||||
| Passed = selectivityResult.Passed, | ||||||||||||||||||||||
| Scenarios = [], | ||||||||||||||||||||||
| OverallImprovementScore = 0, | ||||||||||||||||||||||
| Reason = selectivityResult.Passed | ||||||||||||||||||||||
| ? "Selectivity test passed" | ||||||||||||||||||||||
| : $"Selectivity test failed: {selectivityResult.Reason}", | ||||||||||||||||||||||
| FailureKind = selectivityResult.Passed ? null : "selectivity_failure", | ||||||||||||||||||||||
| ProfileWarnings = profile.Warnings, | ||||||||||||||||||||||
| SelectivityResult = selectivityResult, | ||||||||||||||||||||||
| }; | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| log("β Skipping (no selectivity prompts in eval.yaml)"); | ||||||||||||||||||||||
| return null; | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // Launch overfitting check in parallel with scenario execution | ||||||||||||||||||||||
| var workDir = Path.GetTempPath(); | ||||||||||||||||||||||
| Task<OverfittingResult?> overfittingTask = Task.FromResult<OverfittingResult?>(null); | ||||||||||||||||||||||
|
|
@@ -496,8 +535,8 @@ private static async Task<RunExecutionResult> ExecuteRun( | |||||||||||||||||||||
| runLog("running agents..."); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| var agentTasks = await Task.WhenAll( | ||||||||||||||||||||||
| AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)), | ||||||||||||||||||||||
| AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog))); | ||||||||||||||||||||||
| AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, Log: runLog)), | ||||||||||||||||||||||
| AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, Log: runLog))); | ||||||||||||||||||||||
| var baselineMetrics = agentTasks[0]; | ||||||||||||||||||||||
| var withSkillMetrics = agentTasks[1]; | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
|
|
@@ -642,4 +681,75 @@ private static string SanitizeErrorMessage(string? message) | |||||||||||||||||||||
| var singleLine = raw.ReplaceLineEndings(" "); | ||||||||||||||||||||||
| return singleLine.Length > 150 ? singleLine[..150] + "β¦" : singleLine; | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| private static async Task<SelectivityResult> ExecuteSelectivityTest(SkillInfo skill, ValidatorConfig config, Spinner spinner) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| var prefix = $"[{skill.Name}/selectivity]"; | ||||||||||||||||||||||
| var log = (string msg) => spinner.Log($"{prefix} {msg}"); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // Launch all probes in parallel | ||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can quickly lead to throttling/rejections from the inference API |
||||||||||||||||||||||
| var tasks = new List<Task<SelectivityPromptResult>>(); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| if (skill.EvalConfig!.ShouldActivatePrompts is { } activatePrompts) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| foreach (var prompt in activatePrompts) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| log($"Testing should_activate: \"{Truncate(prompt, 60)}\""); | ||||||||||||||||||||||
| tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: true, config, log)); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| if (skill.EvalConfig.ShouldNotActivatePrompts is { } deactivatePrompts) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| foreach (var prompt in deactivatePrompts) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| log($"Testing should_not_activate: \"{Truncate(prompt, 60)}\""); | ||||||||||||||||||||||
| tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: false, config, log)); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| var promptResults = (await Task.WhenAll(tasks)).ToList(); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
|
Comment on lines
+772
to
+794
|
||||||||||||||||||||||
| // Calculate recall: fraction of should_activate prompts that actually activated | ||||||||||||||||||||||
| var shouldActivateResults = promptResults.Where(r => r.ExpectedActivation).ToList(); | ||||||||||||||||||||||
| double recall = shouldActivateResults.Count > 0 | ||||||||||||||||||||||
| ? (double)shouldActivateResults.Count(r => r.SkillActivated) / shouldActivateResults.Count | ||||||||||||||||||||||
| : 1.0; | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // Calculate precision: fraction of should_not_activate prompts that correctly did NOT activate | ||||||||||||||||||||||
| var shouldNotActivateResults = promptResults.Where(r => !r.ExpectedActivation).ToList(); | ||||||||||||||||||||||
| double precision = shouldNotActivateResults.Count > 0 | ||||||||||||||||||||||
| ? (double)shouldNotActivateResults.Count(r => !r.SkillActivated) / shouldNotActivateResults.Count | ||||||||||||||||||||||
|
Comment on lines
+801
to
+804
|
||||||||||||||||||||||
| // Calculate precision: fraction of should_not_activate prompts that correctly did NOT activate | |
| var shouldNotActivateResults = promptResults.Where(r => !r.ExpectedActivation).ToList(); | |
| double precision = shouldNotActivateResults.Count > 0 | |
| ? (double)shouldNotActivateResults.Count(r => !r.SkillActivated) / shouldNotActivateResults.Count | |
| // Calculate precision: fraction of activations that were expected (TP / (TP + FP)) | |
| var shouldNotActivateResults = promptResults.Where(r => !r.ExpectedActivation).ToList(); | |
| var truePositives = shouldActivateResults.Count(r => r.SkillActivated); | |
| var falsePositives = shouldNotActivateResults.Count(r => r.SkillActivated); | |
| double precision = (truePositives + falsePositives) > 0 | |
| ? (double)truePositives / (truePositives + falsePositives) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -287,6 +287,77 @@ public static async Task<RunMetrics> RunAgent(RunOptions options) | |
| return metrics; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Lightweight probe that sends a prompt and checks whether the skill is activated. | ||
| /// Exits immediately when a SkillInvokedEvent is seen, or waits for the session to | ||
| /// complete/timeout. Designed to run many probes in parallel via Task.WhenAll. | ||
| /// </summary> | ||
| public static async Task<bool> ProbeSkillActivation(RunOptions options) | ||
| { | ||
| var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}"); | ||
| Directory.CreateDirectory(workDir); | ||
| _workDirs.Add(workDir); | ||
|
|
||
| if (options.Verbose) | ||
| { | ||
| var write = options.Log ?? (msg => Console.Error.WriteLine(msg)); | ||
| write($" π {workDir} (skilled)"); | ||
| } | ||
|
|
||
| bool skillActivated = false; | ||
| var done = new TaskCompletionSource<bool>(); | ||
|
|
||
| try | ||
| { | ||
| var client = await GetSharedClient(options.Verbose); | ||
| await using var session = await client.CreateSessionAsync( | ||
| BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers)); | ||
|
|
||
| // 30s timeout β enough for the agent to reach the skill-loading decision | ||
| using var cts = new CancellationTokenSource(30_000); | ||
| cts.Token.Register(() => done.TrySetResult(skillActivated)); | ||
|
|
||
| session.On(evt => | ||
| { | ||
| switch (evt) | ||
| { | ||
| // Skill loaded β we have our answer, bail immediately | ||
| case SkillInvokedEvent: | ||
| skillActivated = true; | ||
| done.TrySetResult(true); | ||
| break; | ||
|
|
||
| // Session finished without loading the skill β not activated | ||
| case SessionIdleEvent: | ||
| done.TrySetResult(skillActivated); | ||
| break; | ||
|
|
||
| case SessionErrorEvent err: | ||
| done.TrySetException(new InvalidOperationException(err.Data.Message ?? "Session error")); | ||
| break; | ||
| } | ||
|
|
||
| if (options.Verbose && evt is SkillInvokedEvent si) | ||
| { | ||
| var write = options.Log ?? (m => Console.Error.WriteLine(m)); | ||
| write($" π Skill invoked: {si.Data.Name}"); | ||
| } | ||
| if (options.Verbose && evt is ToolExecutionStartEvent ts) | ||
| { | ||
| var write = options.Log ?? (m => Console.Error.WriteLine(m)); | ||
| write($" π§ {ts.Data.ToolName}"); | ||
| } | ||
| }); | ||
|
|
||
| await session.SendAsync(new MessageOptions { Prompt = options.Scenario.Prompt }); | ||
| return await done.Task; | ||
|
Comment on lines
+340
to
+377
|
||
| } | ||
| catch | ||
| { | ||
| return skillActivated; | ||
| } | ||
|
Comment on lines
+379
to
+382
|
||
| } | ||
|
|
||
| private static async Task<string> SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath) | ||
| { | ||
| var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,7 @@ public static EvalConfig ParseEvalConfig(string yamlContent) | |
| if (scenarios is not { Count: > 0 }) | ||
| throw new InvalidOperationException("Eval config must have at least one scenario"); | ||
|
|
||
| return new EvalConfig(scenarios); | ||
| return new EvalConfig(scenarios, raw.Selectivity?.ShouldActivate, raw.Selectivity?.ShouldNotActivate); | ||
|
||
| } | ||
|
|
||
| public static (bool Success, EvalConfig? Data, IReadOnlyList<string>? Errors) ValidateEvalConfig(string yamlContent) | ||
|
|
@@ -122,6 +122,15 @@ internal sealed class RawFrontmatter | |
| internal sealed class RawEvalConfig | ||
| { | ||
| public List<RawScenario>? Scenarios { get; set; } | ||
| public RawSelectivity? Selectivity { get; set; } | ||
| } | ||
|
|
||
| internal sealed class RawSelectivity | ||
| { | ||
| [YamlMember(Alias = "should_activate")] | ||
| public List<string>? ShouldActivate { get; set; } | ||
| [YamlMember(Alias = "should_not_activate")] | ||
| public List<string>? ShouldNotActivate { get; set; } | ||
| } | ||
|
|
||
| internal sealed class RawScenario | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In `--selectivity-test` mode this code runs only after the earlier `if (skill.EvalConfig.Scenarios.Count == 0) ... return null;` guard, so selectivity testing is currently impossible for a skill that provides only selectivity prompts (or has an empty `scenarios:` list). If selectivity-only eval.yaml files are intended, move the scenario-count skip below the selectivity branch (or only enforce scenarios when not in selectivity mode).