# Smoke tests for the uipath-rpa skills.
# Introduced in PR #71 — feat(tests): add smoke tests for uipath-rpa skill
name: Smoke RPA Skill Tests (Windows)

# One in-flight run per branch: a newer push cancels the previous run.
concurrency:
  group: smoke-rpa-skills-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

on:
  pull_request:
    # Only run when RPA skills, their test tasks, or the test/workflow
    # infrastructure itself change.
    paths:
      - 'skills/uipath-rpa/**'
      - 'skills/uipath-rpa-legacy/**'
      - 'tests/tasks/uipath-rpa/**'
      - 'tests/tasks/uipath-rpa-legacy/**'
      - 'tests/experiments/**'
      - '.github/workflows/smoke-rpa-skills.yml'
  workflow_dispatch:
jobs:
  # Map changed paths to the smoke-task globs the `smoke` job should run.
  # Outputs:
  #   task_globs — space-separated glob list (relative to tests/), empty if none
  #   skip       — 'true' when no RPA-related change was detected
  detect:
    runs-on: ubuntu-latest
    name: Detect changed RPA skills
    outputs:
      task_globs: ${{ steps.detect.outputs.task_globs }}
      skip: ${{ steps.detect.outputs.skip }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so `git diff origin/<base>...HEAD` can resolve
          # the PR's merge base.
          fetch-depth: 0
      - name: Detect changed RPA skills and map to test tasks
        id: detect
        run: |
          BASE_REF="${{ github.base_ref }}"
          # workflow_dispatch has no base ref — diff against main.
          if [ -z "$BASE_REF" ]; then BASE_REF="main"; fi
          CHANGED=$(git diff --name-only "origin/${BASE_REF}...HEAD")
          # Infra change → run all RPA + rpa-legacy smoke tasks
          if echo "$CHANGED" | grep -qE '^tests/(experiments|_shared)/|^tests/[^/]+\.(py|yaml|toml)$|^\.github/workflows/smoke-rpa-skills\.yml$'; then
            echo "task_globs=tasks/uipath-rpa/**/*.yaml tasks/uipath-rpa-legacy/**/*.yaml" >> "$GITHUB_OUTPUT"
            echo "Running all RPA smoke tests (test infrastructure changed)"
            exit 0
          fi
          GLOBS=""
          if echo "$CHANGED" | grep -qE '^(skills/uipath-rpa/|tests/tasks/uipath-rpa/)'; then
            # NOTE: a bare `[ -d … ] && GLOBS=…` would abort the whole
            # script under the runner's default `bash -e` shell when the
            # directory is missing — use an explicit `if` instead.
            if [ -d "tests/tasks/uipath-rpa" ]; then
              GLOBS="tasks/uipath-rpa/**/*.yaml"
            fi
          fi
          if echo "$CHANGED" | grep -qE '^(skills/uipath-rpa-legacy/|tests/tasks/uipath-rpa-legacy/)'; then
            if [ -d "tests/tasks/uipath-rpa-legacy" ]; then
              if [ -n "$GLOBS" ]; then
                GLOBS="$GLOBS tasks/uipath-rpa-legacy/**/*.yaml"
              else
                GLOBS="tasks/uipath-rpa-legacy/**/*.yaml"
              fi
            fi
          fi
          if [ -z "$GLOBS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "No RPA skill changes detected — skipping"
          else
            echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
            echo "Will test: $GLOBS"
          fi
smoke:
needs: detect
if: needs.detect.outputs.skip != 'true'
runs-on: windows-latest
# RPA smoke tasks spin up Helm on each `uip rpa` call (30–60s each). A
# typical 3-task smoke run with cold Helm + three tasks runs ~25–40
# min; bump the ceiling so we don't cancel mid-task.
timeout-minutes: 90
name: Run RPA skill smoke tests (Windows)
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v4
with:
repository: UiPath/coder_eval
token: ${{ secrets.GH_PAT }}
path: .coder_eval
- uses: actions/setup-python@v5
with:
python-version: '3.13'
- uses: astral-sh/setup-uv@v4
- uses: actions/setup-node@v4
with:
node-version: '20'
- uses: actions/setup-dotnet@v4
with:
dotnet-version: '8.0.x'
- name: Install coder-eval
working-directory: .coder_eval
env:
UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
run: uv pip install --system .
- name: Configure NuGet feed for Helm packages
shell: bash
run: |
dotnet nuget add source \
"https://uipath.pkgs.visualstudio.com/Public.Feeds/_packaging/UiPath-Internal/nuget/v3/index.json" \
--name UiPath-Internal \
--username az \
--password "${{ secrets.UV_INDEX_UIPATH_PASSWORD }}" \
--store-password-in-clear-text
# Install from public npm at @latest. The `--@uipath:registry=`
# flag forces the @uipath scope to public npm even if some
# `.npmrc` (runner image, user-level, or future setup-node config)
# maps it elsewhere — notably the internal GitHub Packages feed,
# which carries divergent 1.0.0-alpha.* prereleases under the
# same scope. Plain `--registry=` does NOT bypass scope mappings;
# only the scope-specific override does. `npm install -g` lands
# under `<npm-prefix>/@uipath/`, where uipcli's ToolManager
# discovers tools, so `uip rpa …` / `uip rpa-legacy …` resolve
# without triggering auto-install.
- name: Install uip CLI + RPA tools (public npm @latest)
shell: bash
run: |
set -e
npm install -g \
--@uipath:registry=https://registry.npmjs.org/ \
@uipath/cli@latest @uipath/rpa-tool@latest @uipath/rpa-legacy-tool@latest
uip --version
uip tools list --output json
# Pre-auth uip as a real licensed Studio user via Studio's own e2e
# auth helper (vendored from Studio/.ci/helm-e2e/oauth-login.mjs).
# Why this and not client_credentials:
# * Helm's HelmFeatureGate runs TWO checks: sign-in state AND
# license SKU entitlement (HelmLicenseSkuFeatureSourceService).
# A licensed user account carries the Studio SKU and clears
# both gates; an External App / client_credentials principal
# typically doesn't.
# * Studio Desktop scopes (StudioWebBackend, OrchestratorApiUserAccess,
# LLMGateway, ...) are user-only — not requestable via
# client_credentials grant at all.
# * Studio engineers themselves use this exact script to drive
# Helm e2e; we get free upkeep when login UI changes.
# The script drives a headless Puppeteer browser through the
# auth-code+PKCE flow, exchanges the code for tokens, then writes
# the standard ~/.uipath/.auth file the JS CLI reads on disk
# (refresh token included — much longer effective lifetime than
# env-auth's ~1h).
- name: Install Puppeteer (auth helper dep)
shell: bash
run: npm install --no-save puppeteer
- name: Authenticate uip as licensed Studio user
id: auth
shell: bash
env:
AUTHORITY: https://alpha.uipath.com
EMAIL: ${{ secrets.UIPATH_EMAIL }}
PASSWORD: ${{ secrets.UIPATH_PASSWORD }}
TENANT: ${{ secrets.UIPATH_TENANT }}
ORG: ${{ secrets.UIPATH_ORG }}
# Drop screenshots + DOM dumps here so we can upload them as an
# artifact for post-mortem on auth failures.
AUTH_DEBUG_DIR: ${{ github.workspace }}/auth-debug
run: |
set -euo pipefail
mkdir -p "$AUTH_DEBUG_DIR"
node .github/scripts/uipath-oauth-login.mjs
# Smoke check the auth context is recognized. Failure here fails
# the job — we'd rather not run smoke tasks against a broken
# login state and chase mysterious sign-in errors per task.
uip login status --output json
# Always upload the auth-debug screenshots / dumps so we have a
# post-mortem artifact even on success (baseline for comparison).
- name: Upload auth-debug artifacts
if: always() && steps.auth.outcome != 'skipped'
uses: actions/upload-artifact@v4
with:
name: auth-debug-${{ github.run_id }}
path: ${{ github.workspace }}/auth-debug
if-no-files-found: ignore
retention-days: 7
- name: Pre-warm Helm (download NuGet package before tests)
shell: bash
run: |
mkdir -p /tmp/helm-warmup && cd /tmp/helm-warmup
uip rpa list-instances --output json 2>&1 || true
taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
- name: Run RPA smoke tests
env:
SKILLS_REPO_PATH: ${{ github.workspace }}
# Route the agent through AWS Bedrock. The Anthropic workspace
# hit its API usage limit (resets 2026-05-01), so DirectRoute
# is blocked. Bedrock has a separate quota.
#
# ANTHROPIC_API_KEY stays set for the LLM reviewer (lightweight
# token spend — coder_eval's reviewer only supports Anthropic
# direct or UiPath LLM Gateway, not Bedrock). If the shared
# workspace quota is still drained when the reviewer fires,
# orchestrator._run_final_llm_review swallows the error and
# the task still records its deterministic criteria result.
API_BACKEND: bedrock
AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
AWS_REGION: ${{ secrets.AWS_REGION }}
BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
working-directory: tests
id: smoke
shell: bash
run: |
# Run each task separately, killing Helm between tasks so one
# task's stale Studio state doesn't leak into the next. Track
# per-task exit codes so one failure doesn't abort the loop
# but still propagates to the overall step outcome.
shopt -s globstar nullglob
overall_exit=0
for task in ${{ needs.detect.outputs.task_globs }}; do
echo "--- Killing leftover Helm/Studio processes ---"
taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
echo "--- Running: $task ---"
# Per-task Bedrock model override. The Anthropic Sonnet
# family (4.5 on Bedrock, 4.6 on Anthropic-direct) trips
# the post-generation content filter when emitting the
# full legacy XAML baseline (21 namespaces + 16 assembly
# refs, including PresentationFramework / PresentationCore
# / WindowsBase / System.Xaml / mscorlib +
# Microsoft.VisualBasic.Activities). Opus emits it
# cleanly. coder_eval's BedrockRoute.model overrides
# task-level model, so the only way to swap per-task is
# at the env level here.
task_model="$BEDROCK_MODEL"
case "$task" in
*rpa-legacy*) task_model="eu.anthropic.claude-opus-4-6-v1" ;;
esac
if ! BEDROCK_MODEL="$task_model" coder-eval run "$task" \
-e experiments/default.yaml --tags smoke -j 1 -v; then
overall_exit=1
fi
done
exit $overall_exit
continue-on-error: true
- name: Summarize results
if: always()
working-directory: tests
shell: bash
run: |
run_dir=$(ls -td runs/*/ 2>/dev/null | head -n 1)
if [ -z "$run_dir" ]; then
echo "::warning::No run directory found — nothing to summarize"
exit 0
fi
echo "Run directory: $run_dir"
echo ""
echo "## Task results"
python - <<'PY'
import json, pathlib
run_dirs = sorted(pathlib.Path('runs').glob('*/'))
rows = []
for run_dir in run_dirs:
for p in sorted(run_dir.rglob('task.json')):
d = json.loads(p.read_text(encoding='utf-8'))
lr = d.get('llm_review') or {}
rows.append((
d.get('task_id', '?'),
d.get('final_status', '?'),
d.get('weighted_score'),
lr.get('score'),
))
if not rows:
print(' (no task.json files found)')
width = max((len(r[0]) for r in rows), default=4)
for task_id, status, score, llm_score in rows:
score_s = f"{score:.2f}" if isinstance(score, (int, float)) else ' —'
llm_s = f"{llm_score:.2f}" if isinstance(llm_score, (int, float)) else ' —'
print(f" {task_id:<{width}} status={status:<8} score={score_s} llm={llm_s}")
PY
echo ""
echo "HTML report artifact: eval-report-${{ github.run_id }}"
echo "Download under the workflow run's Artifacts section and open experiment.html."
- name: Upload HTML / JSON eval report
if: always()
uses: actions/upload-artifact@v4
with:
name: eval-report-${{ github.run_id }}
path: |
tests/runs/*/experiment.html
tests/runs/*/experiment.md
tests/runs/*/experiment.json
tests/runs/*/experiment.log
tests/runs/*/*/variant.html
tests/runs/*/*/variant.md
tests/runs/*/*/variant.json
tests/runs/**/task.html
tests/runs/**/task.json
tests/runs/**/task.log
if-no-files-found: warn
retention-days: 14
- name: Enforce LLM reviewer score threshold (>= 0.7)
if: always()
working-directory: tests
shell: bash
env:
# Reviewer is a qualitative gate and tends to dock for minor
# process issues the agent recovers from (e.g. late-added
# project.uiproj, `uip rpa get-errors` exit 1 when the agent
# falls back to `uip rpa build`). 0.7 keeps the gate meaningful
# without false-failing correct-but-imperfect runs.
REVIEWER_THRESHOLD: "0.7"
run: |
python - <<'PY'
import json, os, pathlib, sys
threshold = float(os.environ.get('REVIEWER_THRESHOLD', '0.8'))
run_dirs = sorted(pathlib.Path('runs').glob('*/'))
if not run_dirs:
print('::warning::No run directory — skipping reviewer threshold check')
sys.exit(0)
task_jsons = []
for run_dir in run_dirs:
task_jsons.extend(sorted(run_dir.rglob('task.json')))
failures = []
missing = []
for p in task_jsons:
d = json.loads(p.read_text(encoding='utf-8'))
lr = d.get('llm_review') or {}
score = lr.get('score')
tid = d.get('task_id', p.parent.name)
if not isinstance(score, (int, float)):
missing.append(tid)
continue
if score < threshold:
issues = (lr.get('issues') or '').splitlines()[0][:120]
failures.append((tid, score, issues))
for tid in missing:
print(f'::warning::{tid}: no llm_review (reviewer skipped or errored)')
if failures:
for tid, score, issues in failures:
print(f'::error::{tid}: llm_review score {score:.2f} < {threshold} — {issues}')
print(f'::error::{len(failures)} task(s) below LLM reviewer threshold {threshold}')
sys.exit(1)
print(f'All {len(task_jsons)} task(s) passed llm_review >= {threshold}')
PY
- name: Check test results
if: always()
working-directory: tests
shell: bash
run: |
shopt -s globstar nullglob
mapfile -t TASK_JSONS < <(printf '%s\n' runs/**/task.json)
if [ ${#TASK_JSONS[@]} -eq 0 ]; then
echo "::error::No task results found"
exit 1
fi
total=${#TASK_JSONS[@]}
passed=$(grep -l '"final_status": "SUCCESS"' "${TASK_JSONS[@]}" | wc -l)
echo "Results: $passed/$total tasks passed"
if [ "$passed" -lt "$total" ]; then
echo "::error::$((total - passed)) task(s) failed — download the eval-report-${{ github.run_id }} artifact and open task.html for each failed task"
exit 1
fi
- name: Fail if smoke step failed or was cancelled
if: steps.smoke.outcome == 'failure' || steps.smoke.outcome == 'cancelled'
shell: bash
run: exit 1