Copilot SDK Tests #53

Workflow file for this run

.github/workflows/skill-evaluation.yml at 34dddbd

	name: Copilot SDK Tests

	# Triggers: a nightly schedule, plus manual dispatch with optional inputs
	# that select a single skill and tune the Ralph Loop.
	on:
	  schedule:
	    # Run nightly at 2 AM UTC
	    - cron: '0 2 * * *'
	  workflow_dispatch:
	    inputs:
	      # Name of one skill to evaluate; empty means evaluate every skill.
	      skill:
	        description: 'Skill to evaluate (leave empty for all)'
	        required: false
	        type: string
	      # Turns on the iterative-improvement mode of the harness.
	      ralph:
	        description: 'Enable Ralph Loop (iterative improvement)'
	        required: false
	        type: boolean
	        default: false
	      # Only forwarded to the harness when `ralph` is enabled.
	      threshold:
	        description: 'Quality threshold (0-100)'
	        required: false
	        type: number
	        default: 60
	      # Only forwarded to the harness when `ralph` is enabled.
	      max_iterations:
	        description: 'Max Ralph Loop iterations'
	        required: false
	        type: number
	        default: 2

	jobs:
	  evaluate:
	    runs-on: ubuntu-latest
	    # Run when the ENABLE_REAL_EVAL repository variable is 'true', or whenever
	    # the workflow is started manually. Note that this condition does not
	    # itself verify that the COPILOT_TOKEN secret is configured.
	    if: ${{ vars.ENABLE_REAL_EVAL == 'true' || github.event_name == 'workflow_dispatch' }}

	    env:
	      # PAT with "Copilot Requests" permission for SDK authentication
	      GH_TOKEN: ${{ secrets.COPILOT_TOKEN }}

	    steps:
	# Fetch the repository contents; later steps work inside tests/.
	- uses: actions/checkout@v4

	# Provide pnpm (major version 9) for dependency installation.
	- uses: pnpm/action-setup@v4
	  with:
	    version: 9

	# Provide Node.js 20 and cache the pnpm store, keyed on the tests lockfile.
	- uses: actions/setup-node@v4
	  with:
	    node-version: '20'
	    cache: 'pnpm'
	    cache-dependency-path: tests/pnpm-lock.yaml

	# The Copilot CLI is installed globally from npm.
	- name: Install Copilot CLI
	  run: npm install -g @github/copilot

	# Fail early if the `copilot` binary is not on PATH.
	- name: Verify Copilot CLI
	  run: copilot --version

	- name: Install test dependencies
	  run: pnpm install
	  working-directory: tests

	# Type errors stop the job before any evaluation is attempted.
	- name: TypeScript check
	  run: pnpm typecheck
	  working-directory: tests

	# Runs the evaluation harness twice (markdown report, then JSON report) and
	# publishes a combined exit code as the `exit_code` step output. The step
	# itself always succeeds so that later reporting steps can run.
	- name: Run skill evaluations
	  id: harness
	  working-directory: tests
	  # Dispatch inputs are passed through the environment instead of being
	  # interpolated into the script text, so their contents are never parsed
	  # as shell code.
	  env:
	    SKILL_ARG: ${{ inputs.skill }}
	    RALPH_ENABLED: ${{ inputs.ralph }}
	    THRESHOLD: ${{ inputs.threshold || 60 }}
	    MAX_ITER: ${{ inputs.max_iterations || 2 }}
	  run: |
	    # Keep going after a harness failure so its exit code can be recorded.
	    set +e

	    # Extra harness flags; stays empty unless the Ralph Loop was requested.
	    RALPH_ARGS=()
	    if [ "$RALPH_ENABLED" = "true" ]; then
	      RALPH_ARGS=(--ralph --threshold "$THRESHOLD" --max-iterations "$MAX_ITER")
	    fi

	    if [ -n "$SKILL_ARG" ]; then
	      # Single skill evaluation
	      pnpm harness "$SKILL_ARG" "${RALPH_ARGS[@]}" --verbose --output markdown --output-file results.md
	      HARNESS_EXIT=$?
	      pnpm harness "$SKILL_ARG" "${RALPH_ARGS[@]}" --output json --output-file results.json
	      JSON_EXIT=$?
	    else
	      # All skills evaluation
	      pnpm harness --all "${RALPH_ARGS[@]}" --verbose --output markdown --output-file results.md
	      HARNESS_EXIT=$?
	      pnpm harness --all "${RALPH_ARGS[@]}" --verbose --output json --output-file results.json
	      JSON_EXIT=$?
	    fi

	    # results.json feeds the failure annotations, so a failure in the JSON
	    # run must be reported even when the markdown run passed.
	    if [ "$HARNESS_EXIT" -eq 0 ]; then
	      HARNESS_EXIT=$JSON_EXIT
	    fi

	    set -e
	    echo "exit_code=$HARNESS_EXIT" >> "$GITHUB_OUTPUT"
	    exit 0

	# Appends a human-readable report to the job summary. Runs even when earlier
	# steps failed, so it must cope with a missing results file and with the
	# harness step never having produced an exit code.
	- name: Write job summary
	  if: always()
	  working-directory: tests
	  env:
	    # Empty when the harness step did not run.
	    HARNESS_EXIT: ${{ steps.harness.outputs.exit_code }}
	    RALPH_ENABLED: ${{ inputs.ralph }}
	    THRESHOLD: ${{ inputs.threshold || 60 }}
	    MAX_ITER: ${{ inputs.max_iterations || 2 }}
	  run: |
	    # Everything printed inside the braces is appended to the summary file.
	    {
	      echo "## Copilot SDK Tests Results"
	      echo ""
	      echo "Mode: Real Copilot SDK evaluation (nightly)"
	      if [ "$RALPH_ENABLED" = "true" ]; then
	        echo "Ralph Loop: Enabled (threshold: $THRESHOLD, max iterations: $MAX_ITER)"
	      fi
	      echo ""

	      if [ -f results.md ]; then
	        cat results.md
	      else
	        echo "No results file found"
	      fi

	      if [ -z "$HARNESS_EXIT" ]; then
	        # No exit code was recorded: do not claim that scenarios failed.
	        echo ""
	        echo "---"
	        echo "An earlier step failed before the evaluation harness could run."
	      elif [ "$HARNESS_EXIT" != "0" ]; then
	        echo ""
	        echo "---"
	        echo "Some skills have failing scenarios. See details above."
	      fi
	    } >> "$GITHUB_STEP_SUMMARY"

	# Keeps both report files as a build artifact for a week. A missing file only
	# produces a warning, since the harness may not have written any output.
	- name: Upload results
	  if: always()
	  uses: actions/upload-artifact@v4
	  with:
	    name: real-evaluation-results
	    path: |
	      tests/results.md
	      tests/results.json
	    retention-days: 7
	    if-no-files-found: warn

	# Surfaces a run-level warning when the harness reported failures. The job
	# is not failed here; this step only annotates.
	- name: Check evaluation results
	  if: always()
	  env:
	    # Empty when the harness step did not run.
	    HARNESS_EXIT: ${{ steps.harness.outputs.exit_code }}
	  run: |
	    # Require a recorded, non-zero exit code; an empty value means the harness
	    # never ran, which is not the same as failing scenarios.
	    if [ -n "$HARNESS_EXIT" ] && [ "$HARNESS_EXIT" != "0" ]; then
	      echo "::warning::Some skills have failing scenarios in real SDK evaluation"
	    fi

	# Emits one `::error::` annotation per failed scenario, listing up to three
	# error-severity findings read from results.json. Best-effort: problems
	# reading the file produce a warning instead of failing the job.
	- name: Add detailed failure annotations
	  if: always()
	  working-directory: tests
	  env:
	    # Empty when the harness step did not run.
	    HARNESS_EXIT: ${{ steps.harness.outputs.exit_code }}
	  run: |
	    if [ -n "$HARNESS_EXIT" ] && [ "$HARNESS_EXIT" != "0" ] && [ -f results.json ]; then
	      # The heredoc delimiter is quoted so the shell leaves the script untouched;
	      # the closing EOF must start at the first column of this script.
	      node - <<'EOF'
	    const fs = require('fs');

	    // Escape text for a workflow command: a raw '%', CR or LF in harness
	    // output would otherwise truncate or corrupt the annotation.
	    const escapeData = (text) =>
	      String(text)
	        .replace(/%/g, '%25')
	        .replace(/\r/g, '%0D')
	        .replace(/\n/g, '%0A');

	    let data;
	    try {
	      data = JSON.parse(fs.readFileSync('results.json', 'utf-8'));
	    } catch (err) {
	      // Leave `data` undefined so the loop below does nothing.
	      console.log(`::warning::Could not read results.json: ${escapeData(err.message)}`);
	    }

	    const skills = data?.skills ?? [];

	    for (const skill of skills) {
	      const results = skill.results ?? [];
	      for (const result of results) {
	        if (result.passed) continue;
	        const findings = result.findings ?? [];
	        const failures = findings.filter(f => f.severity === 'error');
	        if (failures.length === 0) continue;

	        // Cap the annotation at the first three error findings.
	        const top = failures.slice(0, 3);
	        const details = top.map(f => `- ${f.message}${f.suggestion ? ` (💡 ${f.suggestion})` : ''}`).join(' ');
	        const summary = `${skill.skill_name} / ${result.scenario} failed (score: ${Number(result.score).toFixed(1)})`;
	        const message = `${summary} ${details}`;

	        console.log(`::error::${escapeData(message)}`);
	      }
	    }
	    EOF
	    fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Copilot SDK Tests #53

Workflow file

Copilot SDK Tests #53

Uh oh!

Workflow file for this run