feat(tests): add smoke tests for uipath-rpa skill #71
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Smoke tests for the UiPath RPA skills. Triggered on PRs that touch the
# skills, their test tasks, or shared test infrastructure.
name: Smoke RPA Skill Tests (Windows)

# One run per branch: a new push cancels the in-flight run for the same ref.
concurrency:
  group: smoke-rpa-skills-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

on:
  pull_request:
    paths:
      - 'skills/uipath-rpa/**'
      - 'skills/uipath-rpa-legacy/**'
      - 'tests/tasks/uipath-rpa/**'
      - 'tests/tasks/uipath-rpa-legacy/**'
      - 'tests/experiments/**'
      - '.github/workflows/smoke-rpa-skills.yml'
  workflow_dispatch:
jobs:
  # Map changed paths to the smoke-task globs the `smoke` job should run.
  # Emits `skip=true` when nothing RPA-related changed.
  detect:
    runs-on: ubuntu-latest
    name: Detect changed RPA skills
    outputs:
      task_globs: ${{ steps.detect.outputs.task_globs }}
      skip: ${{ steps.detect.outputs.skip }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so `git diff origin/<base>...HEAD` has the merge base.
          fetch-depth: 0
      - name: Detect changed RPA skills and map to test tasks
        id: detect
        run: |
          BASE_REF="${{ github.base_ref }}"
          # workflow_dispatch has no base ref — fall back to main.
          if [ -z "$BASE_REF" ]; then BASE_REF="main"; fi
          CHANGED=$(git diff --name-only "origin/$BASE_REF...HEAD")
          # Infra change → run all RPA + rpa-legacy smoke tasks
          if echo "$CHANGED" | grep -qE '^tests/(experiments|_shared)/|^tests/[^/]+\.(py|yaml|toml)$|^\.github/workflows/smoke-rpa-skills\.yml$'; then
            echo "task_globs=tasks/uipath-rpa/**/*.yaml tasks/uipath-rpa-legacy/**/*.yaml" >> "$GITHUB_OUTPUT"
            echo "Running all RPA smoke tests (test infrastructure changed)"
            exit 0
          fi
          GLOBS=""
          if echo "$CHANGED" | grep -qE '^(skills/uipath-rpa/|tests/tasks/uipath-rpa/)'; then
            # Only add the glob when the task directory actually exists.
            [ -d "tests/tasks/uipath-rpa" ] && GLOBS="tasks/uipath-rpa/**/*.yaml"
          fi
          if echo "$CHANGED" | grep -qE '^(skills/uipath-rpa-legacy/|tests/tasks/uipath-rpa-legacy/)'; then
            if [ -d "tests/tasks/uipath-rpa-legacy" ]; then
              [ -n "$GLOBS" ] && GLOBS="$GLOBS tasks/uipath-rpa-legacy/**/*.yaml" || GLOBS="tasks/uipath-rpa-legacy/**/*.yaml"
            fi
          fi
          if [ -z "$GLOBS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "No RPA skill changes detected — skipping"
          else
            echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
            echo "Will test: $GLOBS"
          fi
| smoke: | |
| needs: detect | |
| if: needs.detect.outputs.skip != 'true' | |
| runs-on: windows-latest | |
| # RPA smoke tasks spin up Helm on each `uip rpa` call (30–60s each). A | |
| # typical 3-task smoke run with cold Helm + three tasks runs ~25–40 | |
| # min; bump the ceiling so we don't cancel mid-task. | |
| timeout-minutes: 90 | |
| name: Run RPA skill smoke tests (Windows) | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/checkout@v4 | |
| with: | |
| repository: UiPath/coder_eval | |
| token: ${{ secrets.GH_PAT }} | |
| path: .coder_eval | |
| - uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.13' | |
| - uses: astral-sh/setup-uv@v4 | |
| - uses: actions/setup-node@v4 | |
| with: | |
| node-version: '20' | |
| - uses: actions/setup-dotnet@v4 | |
| with: | |
| dotnet-version: '8.0.x' | |
| - name: Install coder-eval | |
| working-directory: .coder_eval | |
| env: | |
| UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/" | |
| run: uv pip install --system . | |
| - name: Configure NuGet feed for Helm packages | |
| shell: bash | |
| run: | | |
| dotnet nuget add source \ | |
| "https://uipath.pkgs.visualstudio.com/Public.Feeds/_packaging/UiPath-Internal/nuget/v3/index.json" \ | |
| --name UiPath-Internal \ | |
| --username az \ | |
| --password "${{ secrets.UV_INDEX_UIPATH_PASSWORD }}" \ | |
| --store-password-in-clear-text | |
| # Install from public npm at @latest. The `--@uipath:registry=` | |
| # flag forces the @uipath scope to public npm even if some | |
| # `.npmrc` (runner image, user-level, or future setup-node config) | |
| # maps it elsewhere — notably the internal GitHub Packages feed, | |
| # which carries divergent 1.0.0-alpha.* prereleases under the | |
| # same scope. Plain `--registry=` does NOT bypass scope mappings; | |
| # only the scope-specific override does. `npm install -g` lands | |
| # under `<npm-prefix>/@uipath/`, where uipcli's ToolManager | |
| # discovers tools, so `uip rpa …` / `uip rpa-legacy …` resolve | |
| # without triggering auto-install. | |
| - name: Install uip CLI + RPA tools (public npm @latest) | |
| shell: bash | |
| run: | | |
| set -e | |
| npm install -g \ | |
| --@uipath:registry=https://registry.npmjs.org/ \ | |
| @uipath/cli@latest @uipath/rpa-tool@latest @uipath/rpa-legacy-tool@latest | |
| uip --version | |
| uip tools list --output json | |
| # Pre-auth uip as a real licensed Studio user via Studio's own e2e | |
| # auth helper (vendored from Studio/.ci/helm-e2e/oauth-login.mjs). | |
| # Why this and not client_credentials: | |
| # * Helm's HelmFeatureGate runs TWO checks: sign-in state AND | |
| # license SKU entitlement (HelmLicenseSkuFeatureSourceService). | |
| # A licensed user account carries the Studio SKU and clears | |
| # both gates; an External App / client_credentials principal | |
| # typically doesn't. | |
| # * Studio Desktop scopes (StudioWebBackend, OrchestratorApiUserAccess, | |
| # LLMGateway, ...) are user-only — not requestable via | |
| # client_credentials grant at all. | |
| # * Studio engineers themselves use this exact script to drive | |
| # Helm e2e; we get free upkeep when login UI changes. | |
| # The script drives a headless Puppeteer browser through the | |
| # auth-code+PKCE flow, exchanges the code for tokens, then writes | |
| # the standard ~/.uipath/.auth file the JS CLI reads on disk | |
| # (refresh token included — much longer effective lifetime than | |
| # env-auth's ~1h). | |
| - name: Install Puppeteer (auth helper dep) | |
| shell: bash | |
| run: npm install --no-save puppeteer | |
| - name: Authenticate uip as licensed Studio user | |
| id: auth | |
| shell: bash | |
| env: | |
| AUTHORITY: https://alpha.uipath.com | |
| EMAIL: ${{ secrets.UIPATH_EMAIL }} | |
| PASSWORD: ${{ secrets.UIPATH_PASSWORD }} | |
| TENANT: ${{ secrets.UIPATH_TENANT }} | |
| ORG: ${{ secrets.UIPATH_ORG }} | |
| # Drop screenshots + DOM dumps here so we can upload them as an | |
| # artifact for post-mortem on auth failures. | |
| AUTH_DEBUG_DIR: ${{ github.workspace }}/auth-debug | |
| run: | | |
| set -euo pipefail | |
| mkdir -p "$AUTH_DEBUG_DIR" | |
| node .github/scripts/uipath-oauth-login.mjs | |
| # Smoke check the auth context is recognized. Failure here fails | |
| # the job — we'd rather not run smoke tasks against a broken | |
| # login state and chase mysterious sign-in errors per task. | |
| uip login status --output json | |
| # Always upload the auth-debug screenshots / dumps so we have a | |
| # post-mortem artifact even on success (baseline for comparison). | |
| - name: Upload auth-debug artifacts | |
| if: always() && steps.auth.outcome != 'skipped' | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: auth-debug-${{ github.run_id }} | |
| path: ${{ github.workspace }}/auth-debug | |
| if-no-files-found: ignore | |
| retention-days: 7 | |
| - name: Pre-warm Helm (download NuGet package before tests) | |
| shell: bash | |
| run: | | |
| mkdir -p /tmp/helm-warmup && cd /tmp/helm-warmup | |
| uip rpa list-instances --output json 2>&1 || true | |
| taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true | |
| - name: Run RPA smoke tests | |
| env: | |
| SKILLS_REPO_PATH: ${{ github.workspace }} | |
| # Route the agent through AWS Bedrock. The Anthropic workspace | |
| # hit its API usage limit (resets 2026-05-01), so DirectRoute | |
| # is blocked. Bedrock has a separate quota. | |
| # | |
| # ANTHROPIC_API_KEY stays set for the LLM reviewer (lightweight | |
| # token spend — coder_eval's reviewer only supports Anthropic | |
| # direct or UiPath LLM Gateway, not Bedrock). If the shared | |
| # workspace quota is still drained when the reviewer fires, | |
| # orchestrator._run_final_llm_review swallows the error and | |
| # the task still records its deterministic criteria result. | |
| API_BACKEND: bedrock | |
| AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }} | |
| AWS_REGION: ${{ secrets.AWS_REGION }} | |
| BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }} | |
| ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} | |
| working-directory: tests | |
| id: smoke | |
| shell: bash | |
| run: | | |
| # Run each task separately, killing Helm between tasks so one | |
| # task's stale Studio state doesn't leak into the next. Track | |
| # per-task exit codes so one failure doesn't abort the loop | |
| # but still propagates to the overall step outcome. | |
| shopt -s globstar nullglob | |
| overall_exit=0 | |
| for task in ${{ needs.detect.outputs.task_globs }}; do | |
| echo "--- Killing leftover Helm/Studio processes ---" | |
| taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true | |
| echo "--- Running: $task ---" | |
| # Per-task Bedrock model override. The Anthropic Sonnet | |
| # family (4.5 on Bedrock, 4.6 on Anthropic-direct) trips | |
| # the post-generation content filter when emitting the | |
| # full legacy XAML baseline (21 namespaces + 16 assembly | |
| # refs, including PresentationFramework / PresentationCore | |
| # / WindowsBase / System.Xaml / mscorlib + | |
| # Microsoft.VisualBasic.Activities). Opus emits it | |
| # cleanly. coder_eval's BedrockRoute.model overrides | |
| # task-level model, so the only way to swap per-task is | |
| # at the env level here. | |
| task_model="$BEDROCK_MODEL" | |
| case "$task" in | |
| *rpa-legacy*) task_model="eu.anthropic.claude-opus-4-6-v1" ;; | |
| esac | |
| if ! BEDROCK_MODEL="$task_model" coder-eval run "$task" \ | |
| -e experiments/default.yaml --tags smoke -j 1 -v; then | |
| overall_exit=1 | |
| fi | |
| done | |
| exit $overall_exit | |
| continue-on-error: true | |
| - name: Summarize results | |
| if: always() | |
| working-directory: tests | |
| shell: bash | |
| run: | | |
| run_dir=$(ls -td runs/*/ 2>/dev/null | head -n 1) | |
| if [ -z "$run_dir" ]; then | |
| echo "::warning::No run directory found — nothing to summarize" | |
| exit 0 | |
| fi | |
| echo "Run directory: $run_dir" | |
| echo "" | |
| echo "## Task results" | |
| python - <<'PY' | |
| import json, pathlib | |
| run_dirs = sorted(pathlib.Path('runs').glob('*/')) | |
| rows = [] | |
| for run_dir in run_dirs: | |
| for p in sorted(run_dir.rglob('task.json')): | |
| d = json.loads(p.read_text(encoding='utf-8')) | |
| lr = d.get('llm_review') or {} | |
| rows.append(( | |
| d.get('task_id', '?'), | |
| d.get('final_status', '?'), | |
| d.get('weighted_score'), | |
| lr.get('score'), | |
| )) | |
| if not rows: | |
| print(' (no task.json files found)') | |
| width = max((len(r[0]) for r in rows), default=4) | |
| for task_id, status, score, llm_score in rows: | |
| score_s = f"{score:.2f}" if isinstance(score, (int, float)) else ' —' | |
| llm_s = f"{llm_score:.2f}" if isinstance(llm_score, (int, float)) else ' —' | |
| print(f" {task_id:<{width}} status={status:<8} score={score_s} llm={llm_s}") | |
| PY | |
| echo "" | |
| echo "HTML report artifact: eval-report-${{ github.run_id }}" | |
| echo "Download under the workflow run's Artifacts section and open experiment.html." | |
| - name: Upload HTML / JSON eval report | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: eval-report-${{ github.run_id }} | |
| path: | | |
| tests/runs/*/experiment.html | |
| tests/runs/*/experiment.md | |
| tests/runs/*/experiment.json | |
| tests/runs/*/experiment.log | |
| tests/runs/*/*/variant.html | |
| tests/runs/*/*/variant.md | |
| tests/runs/*/*/variant.json | |
| tests/runs/**/task.html | |
| tests/runs/**/task.json | |
| tests/runs/**/task.log | |
| if-no-files-found: warn | |
| retention-days: 14 | |
| - name: Enforce LLM reviewer score threshold (>= 0.7) | |
| if: always() | |
| working-directory: tests | |
| shell: bash | |
| env: | |
| # Reviewer is a qualitative gate and tends to dock for minor | |
| # process issues the agent recovers from (e.g. late-added | |
| # project.uiproj, `uip rpa get-errors` exit 1 when the agent | |
| # falls back to `uip rpa build`). 0.7 keeps the gate meaningful | |
| # without false-failing correct-but-imperfect runs. | |
| REVIEWER_THRESHOLD: "0.7" | |
| run: | | |
| python - <<'PY' | |
| import json, os, pathlib, sys | |
| threshold = float(os.environ.get('REVIEWER_THRESHOLD', '0.8')) | |
| run_dirs = sorted(pathlib.Path('runs').glob('*/')) | |
| if not run_dirs: | |
| print('::warning::No run directory — skipping reviewer threshold check') | |
| sys.exit(0) | |
| task_jsons = [] | |
| for run_dir in run_dirs: | |
| task_jsons.extend(sorted(run_dir.rglob('task.json'))) | |
| failures = [] | |
| missing = [] | |
| for p in task_jsons: | |
| d = json.loads(p.read_text(encoding='utf-8')) | |
| lr = d.get('llm_review') or {} | |
| score = lr.get('score') | |
| tid = d.get('task_id', p.parent.name) | |
| if not isinstance(score, (int, float)): | |
| missing.append(tid) | |
| continue | |
| if score < threshold: | |
| issues = (lr.get('issues') or '').splitlines()[0][:120] | |
| failures.append((tid, score, issues)) | |
| for tid in missing: | |
| print(f'::warning::{tid}: no llm_review (reviewer skipped or errored)') | |
| if failures: | |
| for tid, score, issues in failures: | |
| print(f'::error::{tid}: llm_review score {score:.2f} < {threshold} — {issues}') | |
| print(f'::error::{len(failures)} task(s) below LLM reviewer threshold {threshold}') | |
| sys.exit(1) | |
| print(f'All {len(task_jsons)} task(s) passed llm_review >= {threshold}') | |
| PY | |
| - name: Check test results | |
| if: always() | |
| working-directory: tests | |
| shell: bash | |
| run: | | |
| shopt -s globstar nullglob | |
| mapfile -t TASK_JSONS < <(printf '%s\n' runs/**/task.json) | |
| if [ ${#TASK_JSONS[@]} -eq 0 ]; then | |
| echo "::error::No task results found" | |
| exit 1 | |
| fi | |
| total=${#TASK_JSONS[@]} | |
| passed=$(grep -l '"final_status": "SUCCESS"' "${TASK_JSONS[@]}" | wc -l) | |
| echo "Results: $passed/$total tasks passed" | |
| if [ "$passed" -lt "$total" ]; then | |
| echo "::error::$((total - passed)) task(s) failed — download the eval-report-${{ github.run_id }} artifact and open task.html for each failed task" | |
| exit 1 | |
| fi | |
| - name: Fail if smoke step failed or was cancelled | |
| if: steps.smoke.outcome == 'failure' || steps.smoke.outcome == 'cancelled' | |
| shell: bash | |
| run: exit 1 |