-
Notifications
You must be signed in to change notification settings - Fork 188
168 lines (143 loc) · 5.48 KB
/
skill-evaluation.yml
File metadata and controls
168 lines (143 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Nightly evaluation of skills against the real Copilot SDK, with an optional
# "Ralph Loop" (iterative improvement) mode and per-scenario failure annotations.
name: Copilot SDK Tests

on:
  schedule:
    # Run nightly at 2 AM UTC
    - cron: '0 2 * * *'
  workflow_dispatch:
    inputs:
      skill:
        description: 'Skill to evaluate (leave empty for all)'
        required: false
        type: string
      ralph:
        description: 'Enable Ralph Loop (iterative improvement)'
        required: false
        type: boolean
        default: false
      threshold:
        description: 'Quality threshold (0-100)'
        required: false
        type: number
        default: 60
      max_iterations:
        description: 'Max Ralph Loop iterations'
        required: false
        type: number
        default: 2

jobs:
  evaluate:
    runs-on: ubuntu-latest
    # Run on manual dispatch, or on schedule only when the repo has opted in
    # via the ENABLE_REAL_EVAL repository variable.
    if: ${{ vars.ENABLE_REAL_EVAL == 'true' || github.event_name == 'workflow_dispatch' }}
    env:
      # PAT with "Copilot Requests" permission for SDK authentication
      GH_TOKEN: ${{ secrets.COPILOT_TOKEN }}
    steps:
      - uses: actions/checkout@v4
      - uses: pnpm/action-setup@v4
        with:
          version: 9
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'pnpm'
          cache-dependency-path: tests/pnpm-lock.yaml
      - name: Install Copilot CLI
        run: npm install -g @github/copilot
      - name: Verify Copilot CLI
        run: copilot --version
      - name: Install test dependencies
        working-directory: tests
        run: pnpm install
      - name: TypeScript check
        working-directory: tests
        run: pnpm typecheck
      - name: Run skill evaluations
        id: harness
        working-directory: tests
        # Inputs are passed through env vars (not interpolated into the script)
        # to prevent shell injection via workflow_dispatch inputs.
        env:
          SKILL_ARG: ${{ inputs.skill }}
          RALPH_ENABLED: ${{ inputs.ralph }}
          THRESHOLD: ${{ inputs.threshold || 60 }}
          MAX_ITER: ${{ inputs.max_iterations || 2 }}
        run: |
          # Capture the harness exit code ourselves so later steps can report
          # failures without aborting the workflow mid-run.
          set +e
          RALPH_FLAG=""
          if [ "$RALPH_ENABLED" = "true" ]; then
            RALPH_FLAG="--ralph --threshold $THRESHOLD --max-iterations $MAX_ITER"
          fi
          if [ -n "$SKILL_ARG" ]; then
            # Single skill evaluation
            pnpm harness "$SKILL_ARG" $RALPH_FLAG --verbose --output markdown --output-file results.md
            HARNESS_EXIT=$?
            pnpm harness "$SKILL_ARG" $RALPH_FLAG --verbose --output json --output-file results.json
          else
            # All skills evaluation
            pnpm harness --all $RALPH_FLAG --verbose --output markdown --output-file results.md
            HARNESS_EXIT=$?
            pnpm harness --all $RALPH_FLAG --verbose --output json --output-file results.json
          fi
          set -e
          echo "exit_code=$HARNESS_EXIT" >> "$GITHUB_OUTPUT"
          # Always exit 0 here; the "Check evaluation results" step surfaces failures.
          exit 0
      - name: Write job summary
        if: always()
        working-directory: tests
        env:
          RALPH_ENABLED: ${{ inputs.ralph }}
          THRESHOLD: ${{ inputs.threshold || 60 }}
          MAX_ITER: ${{ inputs.max_iterations || 2 }}
          HARNESS_EXIT_CODE: ${{ steps.harness.outputs.exit_code }}
        run: |
          echo "## Copilot SDK Tests Results" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "**Mode:** Real Copilot SDK evaluation (nightly)" >> "$GITHUB_STEP_SUMMARY"
          if [ "$RALPH_ENABLED" = "true" ]; then
            echo "**Ralph Loop:** Enabled (threshold: $THRESHOLD, max iterations: $MAX_ITER)" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ -f results.md ]; then
            cat results.md >> "$GITHUB_STEP_SUMMARY"
          else
            echo "No results file found" >> "$GITHUB_STEP_SUMMARY"
          fi
          if [ "$HARNESS_EXIT_CODE" != "0" ]; then
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo "---" >> "$GITHUB_STEP_SUMMARY"
            echo "Some skills have failing scenarios. See details above." >> "$GITHUB_STEP_SUMMARY"
          fi
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: real-evaluation-results
          path: |
            tests/results.md
            tests/results.json
          retention-days: 7
          if-no-files-found: warn
      - name: Check evaluation results
        if: always()
        env:
          HARNESS_EXIT_CODE: ${{ steps.harness.outputs.exit_code }}
        run: |
          if [ "$HARNESS_EXIT_CODE" != "0" ]; then
            echo "::warning::Some skills have failing scenarios in real SDK evaluation"
          fi
      - name: Add detailed failure annotations
        if: always()
        working-directory: tests
        env:
          HARNESS_EXIT_CODE: ${{ steps.harness.outputs.exit_code }}
        run: |
          # Quoted heredoc ('EOF') keeps the Node script free of shell expansion.
          if [ "$HARNESS_EXIT_CODE" != "0" ] && [ -f results.json ]; then
            node - <<'EOF'
          const fs = require('fs');
          const raw = fs.readFileSync('results.json', 'utf-8');
          const data = JSON.parse(raw);
          const skills = data.skills ?? [];
          for (const skill of skills) {
            const results = skill.results ?? [];
            for (const result of results) {
              if (result.passed) continue;
              const findings = result.findings ?? [];
              const failures = findings.filter(f => f.severity === 'error');
              if (failures.length === 0) continue;
              const top = failures.slice(0, 3);
              const details = top.map(f => `- ${f.message}${f.suggestion ? ` (💡 ${f.suggestion})` : ''}`).join(' ');
              const summary = `${skill.skill_name} / ${result.scenario} failed (score: ${Number(result.score).toFixed(1)})`;
              const message = `${summary} ${details}`;
              console.log(`::error::${message}`);
            }
          }
          EOF
          fi