Skill Validation #3614

Workflow file for this run

.github/workflows/skill-validation.yml at b0ea772

	# Skill & agent validation for PRs touching .github/skills/ or .github/agents/.
	#
	# Two modes:
	#   1. Static checks — run automatically on every PR that touches skills/agents.
	#   2. LLM evaluation — runs automatically for contributor PRs, or can be
	#      triggered by a repo contributor posting "/evaluate-skills" on any PR.
	#      Requires the COPILOT_GITHUB_TOKEN secret (Copilot API access).
	#
	# Trigger model:
	#   - pull_request_target: runs in the base repo context with full permissions
	#     and secret access, even for fork PRs. Workflow YAML is always from the
	#     default branch (not the PR), ensuring security.
	#   - issue_comment (/evaluate-skills): same security model as pull_request_target.
	#     Always runs workflow YAML from the default branch.
	#
	# Security model:
	#   - Workflow YAML: always from the default branch (enforced by both triggers)
	#   - Validator binary: downloaded from dotnet/skills releases (trusted)
	#   - Skill/test content: checked out from the PR via sparse-checkout
	#     (only .github/skills and .github/agents — markdown/YAML data files)
	#   - No PR code is compiled or executed
	#   - LLM evaluation: only runs for PRs from contributors with write+ access,
	#     or when explicitly triggered via /evaluate-skills by a contributor

	name: Skill Validation

	# Triggers:
	#   - pull_request_target: PRs that touch skills, agents, the plugin manifest,
	#     or this workflow file.
	#   - issue_comment: every new comment starts a run; only "/evaluate-skills"
	#     comments on PRs get past the slash-gate job.
	#   - workflow_dispatch: manual run (static validation only).
	on:
	  pull_request_target:
	    types: [opened, synchronize, reopened]
	    paths:
	      - '.github/skills/**'
	      - '.github/agents/**'
	      - '.github/plugin.json'
	      - '.github/workflows/skill-validation.yml'

	  issue_comment:
	    types: [created]

	  workflow_dispatch:

	# Concurrency groups (a newer run cancels an older run in the same group):
	#   - "/evaluate-skills" comments: eval-<PR number>, so re-issuing the command
	#     supersedes the previous evaluation of that PR.
	#   - any other comment:           noop-<issue>-<comment id>, unique per comment,
	#     so unrelated comments never cancel a running validation.
	#   - PR events:                   pr-<PR number>.
	#   - anything else:               the run id (never shared).
	# All expression lines are kept at one indentation level so the folded scalar
	# joins them into a single line.
	concurrency:
	  group: >-
	    skill-validation-${{
	    github.event_name == 'issue_comment'
	    && startsWith(github.event.comment.body, '/evaluate-skills')
	    && format('eval-{0}', github.event.issue.number)
	    || github.event_name == 'issue_comment'
	    && format('noop-{0}-{1}', github.event.issue.number, github.event.comment.id)
	    || github.event_name == 'pull_request_target'
	    && format('pr-{0}', github.event.pull_request.number)
	    || github.run_id
	    }}
	  cancel-in-progress: true

	# Workflow-level token permissions. Jobs that declare their own `permissions:`
	# replace this set; jobs without one inherit it.
	permissions:
	  contents: read
	  pull-requests: write
	  issues: write
	  statuses: write
	  checks: write

	env:
	  # Prefix of the actions/cache key under which the validator binary is stored.
	  VALIDATOR_CACHE_PREFIX: skill-validator-linux-x64

	jobs:
	# ==========================================================================
	# PR GATE (pull_request_target)
	# Determine PR source, author permissions, and changed files.
	# ==========================================================================
	pr-gate:
	  name: PR gate
	  # Entry point for PR events; comment-triggered runs go through slash-gate.
	  if: github.event_name == 'pull_request_target'
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	    pull-requests: read
	  outputs:
	    head_sha: ${{ github.event.pull_request.head.sha }}
	    head_repo: ${{ github.event.pull_request.head.repo.full_name }}
	    pr_number: ${{ github.event.pull_request.number }}
	    is_contributor: ${{ steps.perms.outputs.is_contributor }}
	    is_fork: ${{ steps.info.outputs.is_fork }}
	    changed_skills: ${{ steps.discover.outputs.changed_skills }}
	    has_skill_changes: ${{ steps.discover.outputs.has_skill_changes }}
	    has_agent_changes: ${{ steps.discover.outputs.has_agent_changes }}
	  steps:
	    # A PR is a fork PR when its head repository differs from the base repository.
	    - name: Determine fork status
	      id: info
	      env:
	        HEAD: ${{ github.event.pull_request.head.repo.full_name }}
	        BASE: ${{ github.event.pull_request.base.repo.full_name }}
	      run: |
	        IS_FORK=$([[ "$HEAD" != "$BASE" ]] && echo true || echo false)
	        echo "is_fork=$IS_FORK" >> "$GITHUB_OUTPUT"
	        echo "PR from $HEAD → $BASE (fork=$IS_FORK)"

	    # is_contributor=true only for admin / maintain / write permission.
	    - name: Check PR author permissions
	      id: perms
	      env:
	        GH_TOKEN: ${{ github.token }}
	        AUTHOR: ${{ github.event.pull_request.user.login }}
	      run: |
	        # Any API failure is treated as permission "none" rather than failing the job.
	        PERMISSION=$(gh api "repos/${{ github.repository }}/collaborators/${AUTHOR}/permission" \
	          --jq '.permission' 2>/dev/null || echo "none")
	        echo "PR author $AUTHOR has permission: $PERMISSION"
	        if [[ "$PERMISSION" == "admin" || "$PERMISSION" == "write" || "$PERMISSION" == "maintain" ]]; then
	          echo "is_contributor=true" >> "$GITHUB_OUTPUT"
	        else
	          echo "is_contributor=false" >> "$GITHUB_OUTPUT"
	        fi

	    # Derive the set of changed skill directories and changed agent files
	    # from the PR's file list.
	    - name: Discover changed files
	      id: discover
	      env:
	        GH_TOKEN: ${{ github.token }}
	        PR_NUMBER: ${{ github.event.pull_request.number }}
	      run: |
	        CHANGED=$(gh api "repos/${{ github.repository }}/pulls/${PR_NUMBER}/files" \
	          --paginate --jq '.[].filename')

	        # Reduce ".github/skills/<skill>/..." to "<skill>", de-duplicated.
	        # "|| true" keeps an empty result from failing the step.
	        SKILL_DIRS=$(echo "$CHANGED" | grep '^\.github/skills/' | \
	          sed 's|^\.github/skills/\([^/]*\)/.*|\1|' | sort -u || true)
	        AGENT_FILES=$(echo "$CHANGED" | grep '^\.github/agents/' || true)

	        echo "has_skill_changes=$( [ -n "$SKILL_DIRS" ] && echo true || echo false )" >> "$GITHUB_OUTPUT"
	        echo "has_agent_changes=$( [ -n "$AGENT_FILES" ] && echo true || echo false )" >> "$GITHUB_OUTPUT"

	        # Multi-line output; a random delimiter avoids collisions with file names.
	        DELIM="EOF_$(openssl rand -hex 8)"
	        echo "changed_skills<<$DELIM" >> "$GITHUB_OUTPUT"
	        echo "$SKILL_DIRS" >> "$GITHUB_OUTPUT"
	        echo "$DELIM" >> "$GITHUB_OUTPUT"

	        echo "Changed skills: $SKILL_DIRS"
	        echo "Changed agents: $AGENT_FILES"

	# ==========================================================================
	# SLASH COMMAND GATE (/evaluate-skills)
	# ==========================================================================
	slash-gate:
	  name: Gate (/evaluate-skills)
	  # Only comments on pull requests whose body starts with "/evaluate-skills".
	  # All condition lines share one indentation level so the folded scalar
	  # joins them into a single expression.
	  if: >-
	    github.event_name == 'issue_comment' &&
	    github.event.issue.pull_request &&
	    startsWith(github.event.comment.body, '/evaluate-skills')
	  runs-on: ubuntu-latest
	  outputs:
	    head_sha: ${{ steps.pr.outputs.head_sha }}
	    head_repo: ${{ steps.pr.outputs.head_repo }}
	    pr_number: ${{ steps.pr.outputs.pr_number }}
	  steps:
	    # Fail the gate (and so skip all downstream jobs that need it) unless the
	    # commenter has admin / maintain / write permission.
	    - name: Check commenter permissions
	      env:
	        GH_TOKEN: ${{ github.token }}
	        COMMENTER: ${{ github.event.comment.user.login }}
	      run: |
	        PERMISSION=$(gh api "repos/${{ github.repository }}/collaborators/${COMMENTER}/permission" \
	          --jq '.permission')
	        echo "Commenter $COMMENTER has permission: $PERMISSION"
	        if [[ "$PERMISSION" != "admin" && "$PERMISSION" != "write" && "$PERMISSION" != "maintain" ]]; then
	          echo "::error::User does not have write access"
	          exit 1
	        fi

	    # issue_comment payloads carry no head SHA, so look the PR up via the API.
	    - name: Get PR details
	      id: pr
	      env:
	        GH_TOKEN: ${{ github.token }}
	        ISSUE_NUMBER: ${{ github.event.issue.number }}
	      run: |
	        PR_NUMBER="$ISSUE_NUMBER"
	        PR_DATA=$(gh api "repos/${{ github.repository }}/pulls/${PR_NUMBER}")
	        HEAD_SHA=$(echo "$PR_DATA" | jq -r '.head.sha')
	        HEAD_REPO=$(echo "$PR_DATA" | jq -r '.head.repo.full_name')
	        echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"
	        echo "head_repo=${HEAD_REPO}" >> "$GITHUB_OUTPUT"
	        echo "pr_number=${PR_NUMBER}" >> "$GITHUB_OUTPUT"

	    # Best-effort acknowledgement; a failed reaction must not fail the gate.
	    - name: Add reaction to comment
	      env:
	        GH_TOKEN: ${{ github.token }}
	        COMMENT_ID: ${{ github.event.comment.id }}
	      run: |
	        gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}/reactions" \
	          -X POST -f content='eyes' || true

	    # Best-effort as well (continue-on-error): mark the PR head commit as pending
	    # under the "skill-validation" status context, linking to this run.
	    - name: Set pending commit status
	      continue-on-error: true
	      env:
	        GH_TOKEN: ${{ github.token }}
	      run: |
	        gh api "repos/${{ github.repository }}/statuses/${{ steps.pr.outputs.head_sha }}" \
	          -f state=pending \
	          -f context="skill-validation" \
	          -f description="Skill evaluation in progress..." \
	          -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

	# ==========================================================================
	# STATIC VALIDATION
	# Always runs for PRs (all types) and slash-commands.
	# ==========================================================================
	static-check:
	  name: Static validation
	  needs: [pr-gate, slash-gate]
	  # always() lets this job run although one of the two gates is skipped;
	  # it proceeds when either gate succeeded or on a manual dispatch.
	  if: >-
	    always() && !cancelled() && (
	    needs.pr-gate.result == 'success' ||
	    needs.slash-gate.result == 'success' ||
	    github.event_name == 'workflow_dispatch'
	    )
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	  outputs:
	    exit_code: ${{ steps.check.outputs.exit_code }}
	  steps:
	    # Sparse checkout of the PR head: only skill/agent data files are fetched.
	    # On workflow_dispatch both gates are skipped, so this falls back to the
	    # current repository and the default ref.
	    - name: Checkout PR content
	      uses: actions/checkout@v4
	      with:
	        repository: ${{ needs.pr-gate.outputs.head_repo || needs.slash-gate.outputs.head_repo || github.repository }}
	        ref: ${{ needs.pr-gate.outputs.head_sha || needs.slash-gate.outputs.head_sha || '' }}
	        sparse-checkout: |
	          .github/skills
	          .github/agents
	          .github/plugin.json
	        persist-credentials: false

	    # ── Download & cache skill-validator ──────────────────────────
	    # The cache key includes today's date.
	    - name: Get cache key date
	      id: cache-date
	      run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

	    - name: Restore skill-validator from cache
	      id: cache-sv
	      uses: actions/cache/restore@v4
	      with:
	        path: skill-validator-bin
	        key: ${{ env.VALIDATOR_CACHE_PREFIX }}-${{ steps.cache-date.outputs.date }}
	        restore-keys: |
	          ${{ env.VALIDATOR_CACHE_PREFIX }}-

	    - name: Download skill-validator
	      if: steps.cache-sv.outputs.cache-hit != 'true'
	      run: |
	        mkdir -p skill-validator-bin
	        curl -fsSL --retry 3 --retry-all-errors -o skill-validator.tar.gz \
	          https://github.com/dotnet/skills/releases/download/skill-validator-nightly/skill-validator-linux-x64.tar.gz
	        tar -xzf skill-validator.tar.gz -C skill-validator-bin
	        if [ ! -f skill-validator-bin/skill-validator ]; then
	          echo "::error::skill-validator binary not found after extraction"
	          exit 1
	        fi
	        chmod +x skill-validator-bin/skill-validator

	    - name: Save skill-validator to cache
	      if: steps.cache-sv.outputs.cache-hit != 'true'
	      uses: actions/cache/save@v4
	      with:
	        path: skill-validator-bin
	        key: ${{ env.VALIDATOR_CACHE_PREFIX }}-${{ steps.cache-date.outputs.date }}

	    # ── Run skill-validator check ─────────────────────────────────
	    # Never exits non-zero because of validation findings: the combined result
	    # is published as the `exit_code` output and enforced by the last step, so
	    # the results artifact is always produced first.
	    - name: Run skill-validator check
	      id: check
	      shell: bash
	      env:
	        CHANGED_SKILLS: ${{ needs.pr-gate.outputs.changed_skills }}
	      run: |
	        rc=0

	        if [ -d .github/skills ]; then
	          echo "::group::Validate skills"

	          # For PR path: validate only changed skills for efficiency
	          # For slash-command or workflow_dispatch: validate all
	          # Arguments are collected in an array so directory names are passed
	          # to the validator verbatim (no word splitting).
	          PR_GATE="${{ needs.pr-gate.result }}"
	          SKILLS_ARGS=()
	          if [[ "$PR_GATE" == "success" ]]; then
	            while IFS= read -r skill; do
	              [ -z "$skill" ] && continue
	              SKILL_DIR=".github/skills/$skill"
	              if [ -d "$SKILL_DIR" ]; then
	                SKILLS_ARGS+=(--skills "$SKILL_DIR")
	              fi
	            done <<< "$CHANGED_SKILLS"
	          fi
	          # Fallback to all if no specific skills found
	          if [ "${#SKILLS_ARGS[@]}" -eq 0 ]; then
	            SKILLS_ARGS=(--skills .github/skills)
	          fi

	          # Capture the validator's own status (not tee's) without aborting.
	          set +e
	          skill-validator-bin/skill-validator check "${SKILLS_ARGS[@]}" --allow-repo-traversal --verbose 2>&1 | tee skill-check-skills.txt
	          skills_rc=${PIPESTATUS[0]}
	          set -e
	          echo "::endgroup::"
	          if [ "$skills_rc" -ne 0 ]; then rc=1; fi
	        fi

	        if [ -d .github/agents ]; then
	          echo "::group::Validate agents"
	          set +e
	          skill-validator-bin/skill-validator check --agents .github/agents --verbose 2>&1 | tee skill-check-agents.txt
	          agents_rc=${PIPESTATUS[0]}
	          set -e
	          echo "::endgroup::"
	          if [ "$agents_rc" -ne 0 ]; then rc=1; fi
	        fi

	        # Either file may be missing when its directory does not exist.
	        cat skill-check-skills.txt skill-check-agents.txt > sv-output.txt 2>/dev/null || true
	        echo "exit_code=$rc" >> "$GITHUB_OUTPUT"

	        # Step summary
	        {
	          echo "## skill-validator check"
	          echo ""
	          # This step runs with an explicit `shell: bash` (errexit + pipefail),
	          # so a `find` on a missing directory would abort the script; the
	          # trailing "|| true" keeps the count at 0 instead.
	          skill_count=$(find .github/skills -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l || true)
	          agent_count=$(find .github/agents -name '*.agent.md' 2>/dev/null | wc -l || true)
	          if [ "$rc" -eq 0 ]; then
	            echo "All checks passed."
	            echo ""
	            echo "Validated ${skill_count} skill(s) and ${agent_count} agent(s)."
	          else
	            for f in skill-check-skills.txt skill-check-agents.txt; do
	              if [ -f "$f" ]; then
	                echo "### ${f}"
	                echo '```'
	                head -n 200 "$f"
	                echo '```'
	                echo ""
	              fi
	            done
	          fi
	        } >> "$GITHUB_STEP_SUMMARY"

	    # ── Upload results for comment job ────────────────────────────
	    - name: Save results artifact
	      if: always()
	      run: |
	        mkdir -p sv-results
	        skill_count=$(find .github/skills -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l)
	        agent_count=$(find .github/agents -name '*.agent.md' 2>/dev/null | wc -l)
	        echo "$skill_count" > sv-results/skill-count.txt
	        echo "$agent_count" > sv-results/agent-count.txt
	        echo "${{ steps.check.outputs.exit_code }}" > sv-results/exit-code.txt
	        if [ -f sv-output.txt ]; then
	          cp sv-output.txt sv-results/sv-output.txt
	        fi

	    - name: Upload results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: static-check-results
	        path: sv-results/
	        retention-days: 1

	    # Turn the recorded validator result into the job's pass/fail status.
	    - name: Fail if checks failed
	      if: steps.check.outputs.exit_code != '0'
	      run: exit 1

	# ==========================================================================
	# DISCOVER EVALUATABLE SKILLS
	# Only runs when LLM eval should happen (contributor PR or slash-command).
	# ==========================================================================
	discover-eval:
	  name: Discover skills to evaluate
	  needs: [pr-gate, slash-gate]
	  # Runs for PRs whose author is a contributor, or for an accepted
	  # /evaluate-skills command.
	  if: >-
	    always() && !cancelled() && (
	    (needs.pr-gate.result == 'success' && needs.pr-gate.outputs.is_contributor == 'true') ||
	    needs.slash-gate.result == 'success'
	    )
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	  outputs:
	    # JSON array of {name, skills_path, tests_path}; consumed as the evaluate matrix.
	    entries: ${{ steps.find.outputs.entries }}
	    has_entries: ${{ steps.find.outputs.has_entries }}
	  steps:
	    - name: Checkout PR content
	      uses: actions/checkout@v4
	      with:
	        repository: ${{ needs.pr-gate.outputs.head_repo || needs.slash-gate.outputs.head_repo }}
	        ref: ${{ needs.pr-gate.outputs.head_sha || needs.slash-gate.outputs.head_sha }}
	        sparse-checkout: |
	          .github/skills
	          .github/plugin.json
	        persist-credentials: false

	    - name: Discover changed files
	      id: changed
	      env:
	        GH_TOKEN: ${{ github.token }}
	        PR_NUMBER: ${{ needs.pr-gate.outputs.pr_number || needs.slash-gate.outputs.pr_number }}
	      run: |
	        CHANGED=$(gh api "repos/${{ github.repository }}/pulls/${PR_NUMBER}/files" \
	          --paginate --jq '.[].filename')

	        # Reduce ".github/skills/<skill>/..." to "<skill>", de-duplicated.
	        SKILL_DIRS=$(echo "$CHANGED" | grep '^\.github/skills/' | \
	          sed 's|^\.github/skills/\([^/]*\)/.*|\1|' | sort -u || true)

	        # Check for workflow changes (evaluate all skills with tests)
	        WORKFLOW_CHANGES=$(echo "$CHANGED" | grep '^\.github/workflows/skill-validation' || true)

	        DELIM="EOF_$(openssl rand -hex 8)"
	        echo "skill_dirs<<$DELIM" >> "$GITHUB_OUTPUT"
	        echo "$SKILL_DIRS" >> "$GITHUB_OUTPUT"
	        echo "$DELIM" >> "$GITHUB_OUTPUT"

	        if [ -n "$WORKFLOW_CHANGES" ]; then
	          echo "eval_all=true" >> "$GITHUB_OUTPUT"
	        else
	          echo "eval_all=false" >> "$GITHUB_OUTPUT"
	        fi

	    # Keep only skills that ship tests/eval.yaml; everything else is static-only.
	    - name: Find skills with eval tests
	      id: find
	      shell: pwsh
	      env:
	        SKILL_DIRS: ${{ steps.changed.outputs.skill_dirs }}
	        EVAL_ALL: ${{ steps.changed.outputs.eval_all }}
	      run: |
	        $entries = @()
	        $evalAll = $env:EVAL_ALL -eq "true"

	        if ($evalAll) {
	          Write-Host "Workflow changes detected - evaluating all skills with tests"
	          $skills = @(Get-ChildItem -Path ".github/skills" -Directory |
	            Select-Object -ExpandProperty Name)
	        } else {
	          $raw = $env:SKILL_DIRS
	          # No changed skills (e.g. an agents-only PR): avoid calling .Split()
	          # on a null/empty value, which would abort the step.
	          if ([string]::IsNullOrWhiteSpace($raw)) {
	            $skills = @()
	          } else {
	            $skills = @($raw.Split("`n", [StringSplitOptions]::RemoveEmptyEntries) |
	              ForEach-Object { $_.Trim() } |
	              Where-Object { $_ })
	          }
	        }

	        foreach ($skill in $skills) {
	          $evalFile = ".github/skills/$skill/tests/eval.yaml"
	          if (Test-Path $evalFile) {
	            Write-Host " -> $skill has eval tests"
	            $entries += @{
	              name = $skill
	              skills_path = ".github/skills/$skill"
	              tests_path = ".github/skills/$skill/tests"
	            }
	          } else {
	            Write-Host " -> $skill has NO eval tests (static-only)"
	          }
	        }

	        if ($entries.Count -eq 0) {
	          Write-Host "No skills with eval tests to evaluate"
	          echo "entries=[]" >> $env:GITHUB_OUTPUT
	          echo "has_entries=false" >> $env:GITHUB_OUTPUT
	        } else {
	          # -AsArray keeps a single entry serialized as a one-element JSON array.
	          $json = $entries | ConvertTo-Json -Compress -AsArray
	          Write-Host "Entries to evaluate: $json"
	          echo "entries=$json" >> $env:GITHUB_OUTPUT
	          echo "has_entries=true" >> $env:GITHUB_OUTPUT
	        }

	# ==========================================================================
	# LLM EVALUATION (matrix)
	# Runs skill-validator evaluate for each changed skill with eval tests.
	# ==========================================================================
	evaluate:
	  name: evaluate (${{ matrix.entry.name }})
	  needs: [pr-gate, slash-gate, discover-eval]
	  # One matrix job per skill reported by discover-eval.
	  if: >-
	    always() && !cancelled() &&
	    needs.discover-eval.result == 'success' &&
	    needs.discover-eval.outputs.has_entries == 'true'
	  runs-on: ubuntu-latest
	  permissions:
	    contents: read
	  timeout-minutes: 120
	  strategy:
	    fail-fast: false
	    matrix:
	      entry: ${{ fromJson(needs.discover-eval.outputs.entries || '[]') }}
	  steps:
	    - name: Checkout PR content
	      uses: actions/checkout@v4
	      with:
	        repository: ${{ needs.pr-gate.outputs.head_repo || needs.slash-gate.outputs.head_repo }}
	        ref: ${{ needs.pr-gate.outputs.head_sha || needs.slash-gate.outputs.head_sha }}
	        sparse-checkout: |
	          .github/skills
	          .github/plugin.json
	        persist-credentials: false

	    # ── Prepare test directory layout ─────────────────────────────
	    # skill-validator evaluate expects tests at <tests-dir>/<skill>/eval.yaml
	    # but maui keeps them co-located at .github/skills/<skill>/tests/eval.yaml.
	    # Create a flat tests directory by copying files to match the expected layout.
	    - name: Prepare test directory
	      run: |
	        mkdir -p eval-tests
	        for dir in .github/skills/*/tests; do
	          [ -d "$dir" ] || continue
	          [ -f "$dir/eval.yaml" ] || continue
	          # Quoted so a skill directory name is never word-split.
	          skill=$(basename "$(dirname "$dir")")
	          mkdir -p "eval-tests/$skill"
	          # Copy eval.yaml and any fixture files
	          cp -r "$dir"/* "eval-tests/$skill/"
	        done
	        echo "Prepared test directories:"
	        find eval-tests -name 'eval.yaml' | sort

	    # ── Download & cache skill-validator ──────────────────────────
	    - name: Get cache key date
	      id: cache-date
	      run: echo "date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

	    - name: Restore skill-validator from cache
	      id: cache-sv
	      uses: actions/cache/restore@v4
	      with:
	        path: skill-validator-bin
	        key: ${{ env.VALIDATOR_CACHE_PREFIX }}-${{ steps.cache-date.outputs.date }}
	        restore-keys: |
	          ${{ env.VALIDATOR_CACHE_PREFIX }}-

	    - name: Download skill-validator
	      if: steps.cache-sv.outputs.cache-hit != 'true'
	      run: |
	        mkdir -p skill-validator-bin
	        curl -fsSL --retry 3 --retry-all-errors -o skill-validator.tar.gz \
	          https://github.com/dotnet/skills/releases/download/skill-validator-nightly/skill-validator-linux-x64.tar.gz
	        tar -xzf skill-validator.tar.gz -C skill-validator-bin
	        if [ ! -f skill-validator-bin/skill-validator ]; then
	          echo "::error::skill-validator binary not found after extraction"
	          exit 1
	        fi
	        chmod +x skill-validator-bin/skill-validator

	    - name: Save skill-validator to cache
	      if: steps.cache-sv.outputs.cache-hit != 'true'
	      uses: actions/cache/save@v4
	      with:
	        path: skill-validator-bin
	        key: ${{ env.VALIDATOR_CACHE_PREFIX }}-${{ steps.cache-date.outputs.date }}

	    # ── Select Copilot token ──────────────────────────────────────
	    # Up to three token secrets may be configured; matrix jobs are spread
	    # across the configured ones by (job index + run id) modulo token count.
	    - name: Select Copilot token
	      id: select-token
	      env:
	        TOKEN_1: ${{ secrets.COPILOT_GITHUB_TOKEN }}
	        TOKEN_2: ${{ secrets.COPILOT_GITHUB_TOKEN_2 }}
	        TOKEN_3: ${{ secrets.COPILOT_GITHUB_TOKEN_3 }}
	      run: |
	        TOKENS=()
	        NAMES=()
	        for i in 1 2 3; do
	          var="TOKEN_$i"
	          val="${!var}"
	          if [ -n "$val" ]; then
	            TOKENS+=("$val")
	            if [ "$i" -eq 1 ]; then
	              NAMES+=("COPILOT_GITHUB_TOKEN")
	            else
	              NAMES+=("COPILOT_GITHUB_TOKEN_$i")
	            fi
	          fi
	        done

	        if [ ${#TOKENS[@]} -eq 0 ]; then
	          echo "::error::No COPILOT_GITHUB_TOKEN secrets are configured"
	          exit 1
	        fi

	        JOB_INDEX="${{ strategy.job-index }}"
	        RUN_ID="${{ github.run_id }}"
	        if [ -n "$JOB_INDEX" ] && [ ${#TOKENS[@]} -gt 1 ]; then
	          IDX=$(( (JOB_INDEX + RUN_ID) % ${#TOKENS[@]} ))
	        elif [ -n "$JOB_INDEX" ]; then
	          IDX=0
	        else
	          IDX=$((RANDOM % ${#TOKENS[@]}))
	        fi
	        echo "Selected ${NAMES[$IDX]} (1 of ${#TOKENS[@]} available tokens, job-index=${JOB_INDEX:-random})"

	        # Mask the value before it is written anywhere.
	        echo "::add-mask::${TOKENS[$IDX]}"
	        echo "token=${TOKENS[$IDX]}" >> "$GITHUB_OUTPUT"

	    # ── Run LLM evaluation ───────────────────────────────────────
	    - name: Run skill-validator evaluate
	      id: eval-run
	      env:
	        COPILOT_TOKEN: ${{ steps.select-token.outputs.token }}
	        RESULTS_PATH: eval-results/${{ matrix.entry.name }}
	        SKILLS_PATH: ${{ matrix.entry.skills_path }}
	      run: |
	        # skill-validator reads GITHUB_TOKEN for API access
	        export GITHUB_TOKEN="$COPILOT_TOKEN"

	        # Arguments live in an array so RESULTS_PATH (derived from the skill
	        # directory name) reaches the CLI as a single argument.
	        ARGS=(
	          --verdict-warn-only --verbose
	          --results-dir "$RESULTS_PATH"
	          --reporter console --reporter json --reporter markdown
	          --model claude-opus-4.6
	          --judge-model claude-opus-4.6
	          --runs 3
	          --parallel-skills 2
	          --parallel-scenarios 3
	          --parallel-runs 3
	        )

	        set +e
	        skill-validator-bin/skill-validator evaluate "${ARGS[@]}" \
	          --tests-dir eval-tests \
	          "$SKILLS_PATH"
	        EVAL_RC=$?
	        set -e

	        echo "eval_exit_code=$EVAL_RC" >> "$GITHUB_OUTPUT"

	        # Determine actual pass/fail from results.json (the source of truth)
	        # A missing results.json or an empty verdict list counts as not passed.
	        RESULTS_JSON=$(find "$RESULTS_PATH" -name 'results.json' -type f | head -1)
	        if [ -n "$RESULTS_JSON" ]; then
	          ALL_PASSED=$(jq 'if .verdicts | length == 0 then false else all(.verdicts[]; .passed) end' "$RESULTS_JSON")
	          echo "eval_passed=$ALL_PASSED" >> "$GITHUB_OUTPUT"
	        else
	          echo "eval_passed=false" >> "$GITHUB_OUTPUT"
	        fi

	    - name: Upload results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: skill-eval-results-${{ matrix.entry.name }}
	        path: eval-results/${{ matrix.entry.name }}/
	        include-hidden-files: true
	        retention-days: 14

	# ==========================================================================
	# POST PR COMMENT
	# Consolidated results (static + eval) posted directly to the PR.
	# pull_request_target has write permissions, so no separate workflow needed.
	# ==========================================================================
	comment:
	  name: Post results comment
	  needs: [pr-gate, slash-gate, static-check, discover-eval, evaluate]
	  # Runs whenever one of the gates succeeded, regardless of how the
	  # validation jobs themselves ended.
	  if: >-
	    always() && !cancelled() && (
	    needs.pr-gate.result == 'success' ||
	    needs.slash-gate.result == 'success'
	    )
	  runs-on: ubuntu-latest
	  permissions:
	    pull-requests: write
	    issues: write
	  outputs:
	    # 'true' / 'false' from results.json, or 'na' when no results were found.
	    eval_passed: ${{ steps.post-comment.outputs.eval_passed }}
	  steps:
	    # Both downloads are best-effort: the script below copes with missing files.
	    - name: Download static check results
	      uses: actions/download-artifact@v4
	      with:
	        name: static-check-results
	        path: static-results/
	      continue-on-error: true

	    - name: Download eval result artifacts
	      if: needs.evaluate.result == 'success' || needs.evaluate.result == 'failure'
	      uses: actions/download-artifact@v4
	      with:
	        pattern: skill-eval-results-*
	        path: eval-results/
	        merge-multiple: false
	      continue-on-error: true

	    # Builds one Markdown report (static checks + LLM evaluation + judge
	    # reports), writes it to the step summary, and upserts it as a PR comment
	    # identified by a hidden HTML marker.
	    - name: Post comment
	      id: post-comment
	      uses: actions/github-script@v7
	      env:
	        PR_NUMBER: ${{ needs.pr-gate.outputs.pr_number || needs.slash-gate.outputs.pr_number }}
	        STATIC_RESULT: ${{ needs.static-check.result }}
	        EVAL_RESULT: ${{ needs.evaluate.result }}
	        HAS_ENTRIES: ${{ needs.discover-eval.outputs.has_entries }}
	        DISCOVER_RESULT: ${{ needs.discover-eval.result }}
	        # Slash-command runs have no pr-gate output; they count as contributor runs.
	        IS_CONTRIBUTOR: ${{ needs.pr-gate.outputs.is_contributor || 'true' }}
	      with:
	        script: |
	          const fs = require('fs');
	          const path = require('path');

	          // GitHub rejects issue comment bodies longer than 65536 characters;
	          // stay safely below that limit.
	          const MAX_COMMENT_CHARS = 65000;

	          const prNumber = parseInt(process.env.PR_NUMBER, 10);
	          const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
	          const marker = '<!-- skill-validation-results -->';

	          const staticResult = process.env.STATIC_RESULT;
	          const evalResult = process.env.EVAL_RESULT;
	          const hasEntries = process.env.HAS_ENTRIES === 'true';
	          const discoverResult = process.env.DISCOVER_RESULT;
	          const isContributor = process.env.IS_CONTRIBUTOR === 'true';
	          const evalRan = discoverResult === 'success';

	          // ── Helpers ───────────────────────────────────────────
	          // Read a small text file; return `fallback` when it cannot be read.
	          const readTrimmed = (file, fallback) => {
	            try { return fs.readFileSync(file, 'utf8').trim(); }
	            catch { return fallback; }
	          };

	          // Escape pipes so a value cannot break out of a Markdown table cell.
	          const escapePipes = s => (s || '').replace(/\|/g, '\\|');

	          // Recursively list every file under `root`, as paths relative to `root`.
	          const listFilesRecursive = (root) => {
	            const found = [];
	            const walk = (dir) => {
	              for (const entry of fs.readdirSync(dir)) {
	                const full = path.join(dir, entry);
	                if (fs.statSync(full).isDirectory()) walk(full);
	                else found.push(path.relative(root, full));
	              }
	            };
	            walk(root);
	            return found;
	          };

	          // Sub-directories of eval-results/ (one per downloaded artifact).
	          const listResultDirs = () =>
	            fs.readdirSync('eval-results').filter(d =>
	              fs.statSync(path.join('eval-results', d)).isDirectory()
	            );

	          const lines = [marker, '## 🔍 Skill Validation Results', ''];

	          // ── Static check section ──────────────────────────────
	          let staticOutput = '';
	          try {
	            if (fs.existsSync('static-results/sv-output.txt')) {
	              // Strip ANSI color codes from the captured console output.
	              staticOutput = fs.readFileSync('static-results/sv-output.txt', 'utf8')
	                .replace(/\x1b\[[0-9;]*m/g, '').trim();
	            }
	          } catch (e) { /* ignore */ }

	          const skillCount = readTrimmed('static-results/skill-count.txt', '?');
	          const agentCount = readTrimmed('static-results/agent-count.txt', '?');

	          if (staticResult === 'success') {
	            lines.push('### ✅ Static Checks Passed');
	          } else if (staticResult === 'failure') {
	            lines.push('### ❌ Static Checks Failed');
	          } else {
	            lines.push(`### ⚠️ Static Checks: ${staticResult}`);
	          }
	          lines.push(`Skills checked: ${skillCount} | Agents checked: ${agentCount}`);
	          lines.push('');

	          if (staticOutput) {
	            // Surface at most 10 findings (lines starting with a level icon).
	            const findings = staticOutput.split('\n')
	              .map(l => l.trim())
	              .filter(l => /^[❌⚠ℹ]/.test(l))
	              .slice(0, 10);

	            if (findings.length > 0) {
	              lines.push('| Level | Finding |');
	              lines.push('|---|---|');
	              for (const line of findings) {
	                const level = line.startsWith('❌') ? '❌'
	                  : line.startsWith('⚠') ? '⚠️'
	                  : 'ℹ️';
	                const text = escapePipes(line.replace(/^[❌⚠ℹ️\s]+/, ''));
	                lines.push(`| ${level} | ${text} |`);
	              }
	              lines.push('');
	            }

	            lines.push('<details>');
	            lines.push('<summary>Full validator output</summary>');
	            lines.push('');
	            lines.push('```text');
	            // Break up fences so the output cannot close the code block early.
	            lines.push(staticOutput.replace(/```/g, '` ` `'));
	            lines.push('```');
	            lines.push('');
	            lines.push('</details>');
	            lines.push('');
	          }

	          // ── Parse eval results from JSON ──────────────────────
	          // Read results.json files from downloaded artifacts to determine
	          // actual pass/fail (the source of truth, not the job exit code
	          // which uses --verdict-warn-only).
	          let allVerdicts = [];
	          let evalPassed = true;
	          let hasResults = false;
	          const footnotes = [];

	          if (fs.existsSync('eval-results')) {
	            try {
	              for (const dir of listResultDirs()) {
	                const dirPath = path.join('eval-results', dir);
	                const jsonFile = listFilesRecursive(dirPath)
	                  .find(f => f.endsWith('results.json'));
	                if (jsonFile) {
	                  hasResults = true;
	                  const data = JSON.parse(
	                    fs.readFileSync(path.join(dirPath, jsonFile), 'utf8')
	                  );
	                  if (data.verdicts && data.verdicts.length > 0) {
	                    allVerdicts.push(...data.verdicts);
	                    for (const v of data.verdicts) {
	                      if (!v.passed) evalPassed = false;
	                    }
	                  } else {
	                    evalPassed = false; // no verdicts = not passed
	                  }
	                }
	              }
	            } catch (e) {
	              // Unreadable or partial results must never be reported as a pass.
	              evalPassed = false;
	              console.log('Error reading eval results JSON:', e.message);
	            }
	          }

	          // ── LLM evaluation section ────────────────────────────
	          if (!evalRan && !isContributor) {
	            lines.push('### ⏭️ LLM Evaluation: Skipped');
	            lines.push('');
	            lines.push('> 💡 LLM evaluation was not run for this external PR.');
	            lines.push('> A repository contributor can post `/evaluate-skills` on this PR to trigger full evaluation.');
	            lines.push('');
	          } else if (!hasEntries) {
	            lines.push('### ⏭️ LLM Evaluation: Skipped');
	            lines.push('_No changed skills with eval tests found._');
	            lines.push('');
	          } else if (hasResults) {
	            // Use actual results from JSON to determine status
	            if (evalPassed) {
	              lines.push('### ✅ LLM Evaluation Passed');
	            } else {
	              lines.push('### ❌ LLM Evaluation Failed');
	            }
	            const passedCount = allVerdicts.filter(v => v.passed).length;
	            lines.push(`${passedCount}/${allVerdicts.length} skill(s) passed validation`);
	            lines.push('');

	            // ── Build results table ─────────────────────────────
	            if (allVerdicts.length > 0) {
	              lines.push('| Skill | Scenario | Baseline | Skilled | Verdict |');
	              lines.push('|-------|----------|----------|---------|---------|');

	              let fnIndex = 0;
	              for (const verdict of allVerdicts) {
	                const scenarios = verdict.scenarios || [];
	                for (const sc of scenarios) {
	                  const baseScore = sc.baseline?.judgeResult?.overallScore;
	                  const isolatedScore = sc.skilledIsolated?.judgeResult?.overallScore;
	                  const pluginScore = sc.skilledPlugin?.judgeResult?.overallScore;

	                  // Format scores
	                  const baseStr = baseScore != null ? `${baseScore.toFixed(1)}/5` : '—';

	                  // Show both skilled scores when available, otherwise whichever exists.
	                  let skilledStr;
	                  if (isolatedScore != null && pluginScore != null) {
	                    skilledStr = `${isolatedScore.toFixed(1)}/5 (iso) · ${pluginScore.toFixed(1)}/5 (plug)`;
	                  } else if (isolatedScore != null) {
	                    skilledStr = `${isolatedScore.toFixed(1)}/5`;
	                  } else if (pluginScore != null) {
	                    skilledStr = `${pluginScore.toFixed(1)}/5`;
	                  } else {
	                    skilledStr = '—';
	                  }

	                  // Timeout indicator
	                  const timeoutFlag = sc.timedOut ? ' ⏳' : '';

	                  // Verdict icon — per-scenario: improvement >= 0 means not regressed
	                  const improvement = sc.improvementScore || 0;
	                  const scenarioIcon = improvement >= 0 ? '✅' : '⚠️';

	                  // Footnote for high variance or timeout
	                  let footRef = '';
	                  if (sc.highVariance || sc.timedOut) {
	                    fnIndex++;
	                    const parts = [];
	                    if (sc.highVariance) parts.push(`High run-to-run variance (CV=${(sc.varianceCV || 0).toFixed(2)})`);
	                    if (sc.timedOut) parts.push(`Timeout at ${sc.timeoutSeconds || '?'}s`);
	                    footRef = ` <a href="#user-content-fn-${fnIndex}" id="ref-${fnIndex}">[${fnIndex}]</a>`;
	                    footnotes.push(`<a href="#user-content-ref-${fnIndex}" id="fn-${fnIndex}"><strong>[${fnIndex}]</strong></a> ${parts.join('. ')}`);
	                  }

	                  const safeSkillName = escapePipes(verdict.skillName);
	                  const safeScenarioName = escapePipes(sc.scenarioName);
	                  lines.push(`| ${safeSkillName} | ${safeScenarioName} | ${baseStr}${timeoutFlag} | ${skilledStr}${timeoutFlag} | ${scenarioIcon}${footRef} |`);
	                }
	              }
	              lines.push('');

	              // Overall verdict line per skill
	              for (const verdict of allVerdicts) {
	                const icon = verdict.passed ? '✅' : '❌';
	                const reason = escapePipes(verdict.reason);
	                const safeSkillNameSummary = escapePipes(verdict.skillName);
	                lines.push(`${icon} ${safeSkillNameSummary}: ${reason}`);
	                lines.push('');
	              }

	              // Footnotes
	              if (footnotes.length > 0) {
	                for (const fn of footnotes) {
	                  lines.push(fn);
	                }
	                lines.push('');
	              }

	              // Timeout warning
	              const hasTimeout = allVerdicts.some(v =>
	                (v.scenarios || []).some(s => s.timedOut)
	              );
	              if (hasTimeout) {
	                lines.push('> ⏳ timeout — run(s) hit the scenario timeout limit; scoring may be impacted');
	                lines.push('');
	              }
	            }
	          } else if (evalResult === 'success') {
	            lines.push('### ✅ LLM Evaluation Passed');
	            lines.push('');
	          } else if (evalResult === 'failure') {
	            lines.push('### ❌ LLM Evaluation Failed');
	            lines.push('');
	          } else if (evalResult === 'skipped') {
	            lines.push('### ⏭️ LLM Evaluation: Skipped');
	            lines.push('');
	          } else {
	            lines.push(`### ⚠️ LLM Evaluation: ${evalResult}`);
	            lines.push('');
	          }

	          // Detailed judge reports in collapsible sections
	          if (fs.existsSync('eval-results')) {
	            try {
	              for (const dir of listResultDirs()) {
	                const skillName = dir.replace('skill-eval-results-', '');
	                const dirPath = path.join('eval-results', dir);

	                // Include per-scenario judge reports (not summary.md which duplicates the table)
	                const mdFiles = listFilesRecursive(dirPath).filter(f =>
	                  f.endsWith('.md') && !f.endsWith('summary.md')
	                );
	                for (const mdFile of mdFiles) {
	                  const mdContent = fs.readFileSync(
	                    path.join(dirPath, mdFile), 'utf8'
	                  ).trim();
	                  if (mdContent.length > 0) {
	                    const scenarioName = path.basename(mdFile, '.md');
	                    lines.push(`<details>`);
	                    lines.push(`<summary>📊 ${skillName} / ${scenarioName}</summary>`);
	                    lines.push('');
	                    // Neutralize fences and closing tags so report content
	                    // cannot terminate the surrounding <details> wrapper.
	                    lines.push(mdContent.replace(/```/g, '` ` `').replace(/<\/details>/gi, '&lt;/details&gt;'));
	                    lines.push('');
	                    lines.push('</details>');
	                    lines.push('');
	                  }
	                }
	              }
	            } catch (e) {
	              console.log('Error reading eval result details:', e.message);
	            }
	          }

	          // ── Investigation prompt for failures ─────────────────
	          // When any evaluated skill failed, build a copy-paste prompt
	          // that tells the user how to download artifacts and investigate
	          // with their AI coding agent (same pattern as dotnet/skills).
	          let investigatePrompt = '';
	          if (hasResults && !evalPassed) {
	            const runId = context.runId;
	            const repo = `${context.repo.owner}/${context.repo.repo}`;
	            investigatePrompt = [
	              '',
	              '> To investigate failures, paste this to your AI coding agent:',
	              '>',
	              `> _For PR #${prNumber} in ${repo}, download eval artifacts with ` +
	              `\`gh run download ${runId} --repo ${repo} --pattern "skill-eval-results-*" --dir ./eval-results\`, ` +
	              `then fetch https://raw.githubusercontent.com/dotnet/skills/main/eng/skill-validator/src/docs/InvestigatingResults.md ` +
	              `and follow it to analyze the results.json files. Diagnose each failure, suggest fixes to the eval.yaml ` +
	              `and skill content, and tell me what to fix first._`,
	            ].join('\n');
	          }

	          // ── Pipeline link (styled like dotnet/skills) ─────────
	          lines.push(`[🔍 Full results and investigation steps](${runUrl})`);

	          const fullBody = lines.join('\n');

	          // The PR comment is capped; the step summary keeps the full report.
	          let body = fullBody;
	          if (body.length > MAX_COMMENT_CHARS) {
	            const notice = `\n\n> ⚠️ Output truncated to fit the GitHub comment size limit. [See the full results](${runUrl}).`;
	            body = body.slice(0, MAX_COMMENT_CHARS - notice.length)
	              .replace(/[\uD800-\uDBFF]$/, '') // do not end on half a surrogate pair
	              + notice;
	          }

	          // ── Write step summary with investigation prompt ──────
	          const summaryPath = process.env.GITHUB_STEP_SUMMARY;
	          if (summaryPath) {
	            const summaryLines = ['## Skill Validation Results', ''];
	            summaryLines.push(fullBody.replace(marker, '').trim());
	            if (investigatePrompt) {
	              summaryLines.push(investigatePrompt);
	            }
	            fs.appendFileSync(summaryPath, summaryLines.join('\n') + '\n');
	          }

	          // Upsert comment (find existing with marker, update or create)
	          // Paginate to handle PRs with 100+ comments
	          const comments = await github.paginate(
	            github.rest.issues.listComments,
	            {
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              issue_number: prNumber,
	              per_page: 100,
	            }
	          );

	          const existing = comments.find(c => c.body && c.body.includes(marker));

	          if (existing) {
	            await github.rest.issues.updateComment({
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              comment_id: existing.id,
	              body,
	            });
	            console.log(`Updated existing comment ${existing.id}`);
	          } else {
	            await github.rest.issues.createComment({
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              issue_number: prNumber,
	              body,
	            });
	            console.log('Created new PR comment');
	          }

	          // Save eval pass/fail for downstream jobs
	          const outputPath = process.env.GITHUB_OUTPUT;
	          if (outputPath) {
	            fs.appendFileSync(outputPath, `eval_passed=${hasResults ? evalPassed : 'na'}\n`);
	          }

	# ==========================================================================
	# REPORT STATUS
	# Post final commit status on PR head SHA.
	# ==========================================================================
	report-status:
	name: Report status
	# Waits for every upstream job so each result/output read by the steps below is final.
	needs: [pr-gate, slash-gate, static-check, discover-eval, evaluate, comment]
	# Runs even when upstream jobs failed or were skipped (always()), but not when the
	# run was cancelled, and only if one of the two gate jobs succeeded.
	if: >-
	always() && !cancelled() && (
	needs.pr-gate.result == 'success' \|\|
	needs.slash-gate.result == 'success'
	)
	runs-on: ubuntu-latest
	# Token scopes used by the steps below:
	#   statuses: write - POST to the commit statuses API ("Set commit status")
	#   checks: write   - checks.create ("Create check run")
	#   issues: write   - presumably required to delete the comment reaction
	#                     ("Remove reaction") - TODO confirm
	permissions:
	statuses: write
	checks: write
	issues: write
	steps:
	# Post the final `skill-validation` commit status on the PR head SHA.
	# Job results and outputs reach the script through `env` instead of being spliced
	# into the script text, so a value can never be parsed as shell syntax.
	- name: Set commit status
	  env:
	    GH_TOKEN: ${{ github.token }}
	    HEAD_SHA: ${{ needs.pr-gate.outputs.head_sha || needs.slash-gate.outputs.head_sha }}
	    STATIC: ${{ needs.static-check.result }}
	    EVAL: ${{ needs.evaluate.result }}
	    DISCOVER: ${{ needs.discover-eval.result }}
	    HAS_ENTRIES: ${{ needs.discover-eval.outputs.has_entries }}
	    EVAL_PASSED: ${{ needs.comment.outputs.eval_passed }}
	  run: |
	    if [ -z "$HEAD_SHA" ]; then
	      echo "No head SHA (workflow_dispatch?) — skipping status"
	      exit 0
	    fi

	    # Decision order:
	    #   1. static checks must have succeeded, otherwise failure/error;
	    #   2. if discovery did not succeed or found no entries, static success is enough;
	    #   3. otherwise the verdict written by the comment job (eval_passed) decides,
	    #      falling back to the raw result of the evaluate job.
	    if [[ "$STATIC" == "success" ]]; then
	      if [[ "$DISCOVER" != "success" || "$HAS_ENTRIES" != "true" ]]; then
	        STATE="success"
	        DESC="Skill validation passed (static only)"
	      elif [[ "$EVAL_PASSED" == "true" ]]; then
	        STATE="success"
	        DESC="Skill validation passed"
	      elif [[ "$EVAL_PASSED" == "false" ]]; then
	        STATE="failure"
	        DESC="LLM evaluation failed"
	      elif [[ "$EVAL" == "failure" ]]; then
	        STATE="failure"
	        DESC="LLM evaluation failed"
	      else
	        STATE="error"
	        DESC="Evaluation incomplete ($EVAL)"
	      fi
	    elif [[ "$STATIC" == "failure" ]]; then
	      STATE="failure"
	      DESC="Static validation failed"
	    else
	      STATE="error"
	      DESC="Validation incomplete (static: $STATIC)"
	    fi

	    # Post commit status (appears in PR status checks).
	    # GITHUB_REPOSITORY / GITHUB_SERVER_URL / GITHUB_RUN_ID are runner defaults.
	    gh api "repos/${GITHUB_REPOSITORY}/statuses/${HEAD_SHA}" \
	      -f state="$STATE" \
	      -f context="skill-validation" \
	      -f description="$DESC" \
	      -f target_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"

	# Create a Check Run on the PR head SHA so it shows in the PR Checks tab.
	# pull_request_target runs are associated with the base branch, not the PR,
	# so without this the workflow link won't appear on the PR.
	# Inputs are passed through `env` and read from process.env rather than being
	# interpolated into JavaScript string literals.
	- name: Create check run
	  if: always()
	  uses: actions/github-script@v7
	  env:
	    HEAD_SHA: ${{ needs.pr-gate.outputs.head_sha || needs.slash-gate.outputs.head_sha }}
	    STATIC_RESULT: ${{ needs.static-check.result }}
	    EVAL_PASSED: ${{ needs.comment.outputs.eval_passed }}
	    EVAL_RESULT: ${{ needs.evaluate.result }}
	    HAS_ENTRIES: ${{ needs.discover-eval.outputs.has_entries }}
	  with:
	    script: |
	      const headSha = process.env.HEAD_SHA;
	      if (!headSha) return; // no PR head to attach the check run to

	      const staticResult = process.env.STATIC_RESULT;
	      const evalPassed = process.env.EVAL_PASSED;
	      const evalResult = process.env.EVAL_RESULT;
	      const hasEntries = process.env.HAS_ENTRIES === 'true';

	      let conclusion, title;
	      if (staticResult === 'failure') {
	        conclusion = 'failure';
	        title = 'Static validation failed';
	      } else if (staticResult !== 'success') {
	        // Skipped/cancelled static checks are not a validation failure; say so in
	        // the title, but keep a non-passing conclusion so the check never shows
	        // green without static validation having succeeded.
	        conclusion = 'failure';
	        title = `Validation incomplete (static: ${staticResult})`;
	      } else if (!hasEntries) {
	        conclusion = 'success';
	        title = 'Skill validation passed (static only)';
	      } else if (evalPassed === 'true') {
	        conclusion = 'success';
	        title = 'Skill validation passed';
	      } else if (evalPassed === 'false') {
	        conclusion = 'failure';
	        title = 'LLM evaluation failed';
	      } else if (evalResult === 'failure') {
	        conclusion = 'failure';
	        title = 'LLM evaluation failed';
	      } else {
	        conclusion = 'neutral';
	        title = `Evaluation: ${evalResult}`;
	      }

	      const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

	      await github.rest.checks.create({
	        owner: context.repo.owner,
	        repo: context.repo.repo,
	        name: 'Skill Validation',
	        head_sha: headSha,
	        status: 'completed',
	        conclusion,
	        output: {
	          title,
	          summary: `[View full results](${runUrl})`,
	        },
	        details_url: runUrl,
	      });
	      console.log(`Created check run: ${conclusion} - ${title}`);

	# Remove eyes reaction (slash command only).
	# `always()` is required here: a step `if` without a status-check function gets an
	# implicit `success()`, so a failure in an earlier step would skip this cleanup and
	# leave the reaction on the comment.
	- name: Remove reaction
	  if: always() && needs.slash-gate.result == 'success'
	  env:
	    GH_TOKEN: ${{ github.token }}
	    COMMENT_ID: ${{ github.event.comment.id }}
	  run: |
	    # Ask the API for "eyes" reactions only, with the largest page size, so the
	    # bot's reaction is found even on a comment with many other reactions.
	    REACTION_ID=$(gh api "repos/${GITHUB_REPOSITORY}/issues/comments/${COMMENT_ID}/reactions?content=eyes&per_page=100" \
	      --jq '.[] | select(.content == "eyes" and .user.login == "github-actions[bot]") | .id' \
	      | head -1 || echo "")
	    if [[ -n "$REACTION_ID" && "$REACTION_ID" != "null" ]]; then
	      # Best effort: a failed delete must not fail the job.
	      gh api "repos/${GITHUB_REPOSITORY}/issues/comments/${COMMENT_ID}/reactions/${REACTION_ID}" \
	        -X DELETE || true
	    fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Skill Validation #3614

Workflow file

Skill Validation #3614

Uh oh!

Workflow file for this run