-
Notifications
You must be signed in to change notification settings - Fork 188
168 lines (143 loc) · 5.48 KB
/
skill-evaluation.yml
File metadata and controls
168 lines (143 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Nightly evaluation of skills against the real Copilot SDK, with an optional
# "Ralph Loop" (iterative improvement) mode and per-scenario failure annotations.
name: Copilot SDK Tests

on:
  schedule:
    # Run nightly at 2 AM UTC
    - cron: '0 2 * * *'
  workflow_dispatch:
    inputs:
      skill:
        description: 'Skill to evaluate (leave empty for all)'
        required: false
        type: string
      ralph:
        description: 'Enable Ralph Loop (iterative improvement)'
        required: false
        type: boolean
        default: false
      threshold:
        description: 'Quality threshold (0-100)'
        required: false
        type: number
        default: 60
      max_iterations:
        description: 'Max Ralph Loop iterations'
        required: false
        type: number
        default: 2

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Run on manual dispatch, or on schedule only when the repo has opted in
    # via the ENABLE_REAL_EVAL repository variable.
    if: ${{ vars.ENABLE_REAL_EVAL == 'true' || github.event_name == 'workflow_dispatch' }}
    env:
      # PAT with "Copilot Requests" permission for SDK authentication
      GH_TOKEN: ${{ secrets.COPILOT_TOKEN }}
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
        with:
          version: 9
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'pnpm'
          cache-dependency-path: tests/pnpm-lock.yaml
      - name: Install Copilot CLI
        run: npm install -g @github/copilot
      - name: Verify Copilot CLI
        run: copilot --version
      - name: Install test dependencies
        working-directory: tests
        run: pnpm install
      - name: TypeScript check
        working-directory: tests
        run: pnpm typecheck
      - name: Run skill evaluations
        id: harness
        working-directory: tests
        # Inputs are passed through env vars (not interpolated into the script)
        # to prevent shell injection via workflow_dispatch inputs.
        env:
          SKILL_ARG: ${{ inputs.skill }}
          RALPH_ENABLED: ${{ inputs.ralph }}
          THRESHOLD: ${{ inputs.threshold || 60 }}
          MAX_ITER: ${{ inputs.max_iterations || 2 }}
        run: |
          # Capture the harness exit code ourselves so later steps can report
          # failures without aborting the workflow mid-run.
          set +e
          RALPH_FLAG=""
          if [ "$RALPH_ENABLED" = "true" ]; then
            RALPH_FLAG="--ralph --threshold $THRESHOLD --max-iterations $MAX_ITER"
          fi
          if [ -n "$SKILL_ARG" ]; then
            # Single skill evaluation
            pnpm harness "$SKILL_ARG" $RALPH_FLAG --verbose --output markdown --output-file results.md
            HARNESS_EXIT=$?
            pnpm harness "$SKILL_ARG" $RALPH_FLAG --verbose --output json --output-file results.json
          else
            # All skills evaluation
            pnpm harness --all $RALPH_FLAG --verbose --output markdown --output-file results.md
            HARNESS_EXIT=$?
            pnpm harness --all $RALPH_FLAG --verbose --output json --output-file results.json
          fi
          set -e
          echo "exit_code=$HARNESS_EXIT" >> "$GITHUB_OUTPUT"
          # Always exit 0 here; the "Check evaluation results" step surfaces failures.
          exit 0
      - name: Write job summary
        if: always()
        working-directory: tests
        env:
          RALPH_ENABLED: ${{ inputs.ralph }}
          THRESHOLD: ${{ inputs.threshold || 60 }}
          MAX_ITER: ${{ inputs.max_iterations || 2 }}
          HARNESS_EXIT_CODE: ${{ steps.harness.outputs.exit_code }}
        run: |
          echo "## Copilot SDK Tests Results" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "**Mode:** Real Copilot SDK evaluation (nightly)" >> "$GITHUB_STEP_SUMMARY"
          if [ "$RALPH_ENABLED" = "true" ]; then
            echo "**Ralph Loop:** Enabled (threshold: $THRESHOLD, max iterations: $MAX_ITER)" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ -f results.md ]; then
            cat results.md >> "$GITHUB_STEP_SUMMARY"
          else
            echo "No results file found" >> "$GITHUB_STEP_SUMMARY"
          fi
          if [ "$HARNESS_EXIT_CODE" != "0" ]; then
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo "---" >> "$GITHUB_STEP_SUMMARY"
            echo "Some skills have failing scenarios. See details above." >> "$GITHUB_STEP_SUMMARY"
          fi
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: real-evaluation-results
          path: |
            tests/results.md
            tests/results.json
          retention-days: 7
          if-no-files-found: warn
      - name: Check evaluation results
        if: always()
        env:
          HARNESS_EXIT_CODE: ${{ steps.harness.outputs.exit_code }}
        run: |
          if [ "$HARNESS_EXIT_CODE" != "0" ]; then
            echo "::warning::Some skills have failing scenarios in real SDK evaluation"
          fi
      - name: Add detailed failure annotations
        if: always()
        working-directory: tests
        env:
          HARNESS_EXIT_CODE: ${{ steps.harness.outputs.exit_code }}
        run: |
          # Quoted heredoc ('EOF') keeps the Node script free of shell expansion.
          if [ "$HARNESS_EXIT_CODE" != "0" ] && [ -f results.json ]; then
            node - <<'EOF'
          const fs = require('fs');
          const raw = fs.readFileSync('results.json', 'utf-8');
          const data = JSON.parse(raw);
          const skills = data.skills ?? [];
          for (const skill of skills) {
            const results = skill.results ?? [];
            for (const result of results) {
              if (result.passed) continue;
              const findings = result.findings ?? [];
              const failures = findings.filter(f => f.severity === 'error');
              if (failures.length === 0) continue;
              const top = failures.slice(0, 3);
              const details = top.map(f => `- ${f.message}${f.suggestion ? ` (💡 ${f.suggestion})` : ''}`).join(' ');
              const summary = `${skill.skill_name} / ${result.scenario} failed (score: ${Number(result.score).toFixed(1)})`;
              const message = `${summary} ${details}`;
              console.log(`::error::${message}`);
            }
          }
          EOF
          fi