Skip to content

feat(tests): add smoke tests for uipath-rpa skill #761

feat(tests): add smoke tests for uipath-rpa skill

feat(tests): add smoke tests for uipath-rpa skill #761

Workflow file for this run

name: Smoke Skill Tests

# One in-flight run per branch: a new push cancels the previous run.
concurrency:
  group: smoke-skills-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

on:
  pull_request:
    paths:
      - 'skills/*/SKILL.md'
      - 'skills/*/references/**'
      - 'tests/**'
  workflow_dispatch:
jobs:
  detect:
    name: Detect changed skills
    runs-on: ubuntu-latest
    outputs:
      task_globs: ${{ steps.detect.outputs.task_globs }}
      skip: ${{ steps.detect.outputs.skip }}
      untested_skills: ${{ steps.detect.outputs.untested_skills }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so `git diff` against the PR base ref works.
          fetch-depth: 0
      - name: Detect changed skills and map to test tasks
        id: detect
        run: |
          # Diff against the PR base. workflow_dispatch has no base ref,
          # so fall back to an empty change set instead of running a
          # malformed `git diff origin/...HEAD` that fails the step.
          if [ -n "${{ github.base_ref }}" ]; then
            CHANGED=$(git diff --name-only "origin/${{ github.base_ref }}...HEAD")
          else
            CHANGED=""
          fi
          # RPA + rpa-legacy skills run on the Windows workflow
          # (smoke-rpa-skills.yml). Helm/Studio only exists there, and
          # the full lifecycle (`uip rpa` / `uip rpa-legacy` CLI) is
          # what's being exercised — so we skip those tasks here.
          WINDOWS_SKILLS="uipath-rpa uipath-rpa-legacy"
          # If test infrastructure changed, run all smoke tests except
          # the Windows-only ones.
          if echo "$CHANGED" | grep -qE '^tests/(experiments|_shared)/|^tests/[^/]+\.(py|yaml|toml)$'; then
            shopt -s globstar nullglob
            GLOBS=""
            for f in tests/tasks/**/*.yaml; do
              case "$f" in
                tests/tasks/uipath-rpa/*|tests/tasks/uipath-rpa-legacy/*) continue ;;
              esac
              GLOBS="$GLOBS ${f#tests/}"
            done
            # xargs collapses the leading/doubled spaces into single ones.
            GLOBS=$(echo "$GLOBS" | xargs)
            if [ -z "$GLOBS" ]; then
              echo "skip=true" >> "$GITHUB_OUTPUT"
              echo "No non-Windows smoke tasks to run."
            else
              echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
              echo "Running all non-Windows smoke tests (test infrastructure changed)"
            fi
            exit 0
          fi
          # Extract unique skill names from changed paths
          SKILLS=$(echo "$CHANGED" | grep '^skills/' | sed 's|skills/\([^/]*\)/.*|\1|' | sort -u)
          # Also include skills whose test tasks changed
          TEST_SKILLS=$(echo "$CHANGED" | grep '^tests/tasks/' | sed 's|tests/tasks/\([^/]*\)/.*|\1|' | sort -u)
          SKILLS=$(printf '%s\n%s' "$SKILLS" "$TEST_SKILLS" | sort -u | grep -v '^$' || true)
          if [ -z "$SKILLS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "No skill changes detected — skipping smoke tests"
            exit 0
          fi
          # Build task glob pattern for changed skills that have tests.
          # Skip the Windows-only RPA skills — they run on smoke-rpa-skills.yml.
          GLOBS=""
          UNTESTED=""
          for skill in $SKILLS; do
            if echo "$WINDOWS_SKILLS" | grep -qw "$skill"; then
              echo "Skipping $skill (runs on Windows — see smoke-rpa-skills.yml)"
              continue
            fi
            if [ -d "tests/tasks/$skill" ]; then
              if [ -n "$GLOBS" ]; then
                GLOBS="$GLOBS tasks/$skill/**/*.yaml"
              else
                GLOBS="tasks/$skill/**/*.yaml"
              fi
              echo "Will test: $skill"
            else
              if [ -n "$UNTESTED" ]; then
                UNTESTED="$UNTESTED, $skill"
              else
                UNTESTED="$skill"
              fi
              echo "No tests for: $skill (skipping)"
            fi
          done
          if [ -n "$UNTESTED" ]; then
            echo "untested_skills=$UNTESTED" >> "$GITHUB_OUTPUT"
          fi
          if [ -z "$GLOBS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "::warning::Changed skills have no smoke tests: $UNTESTED"
          else
            echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
          fi
warn-untested:
needs: detect
if: needs.detect.outputs.untested_skills != ''
runs-on: ubuntu-latest
name: Warn about untested skills
permissions:
pull-requests: write
steps:
- uses: actions/github-script@v7
with:
script: |
const skills = '${{ needs.detect.outputs.untested_skills }}';
const body = `⚠️ **Smoke test coverage gap** — the following changed skills have no tests under \`tests/tasks/\`:\n\n${skills.split(', ').map(s => '- `' + s + '`').join('\n')}\n\nConsider adding smoke tests before merging.`;
// Avoid duplicate comments on repeated pushes
const { data: comments } = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const existing = comments.find(c => c.body.includes('Smoke test coverage gap'));
if (existing) {
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: existing.id,
body,
});
} else {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body,
});
}
e2e:
needs: detect
if: needs.detect.outputs.skip != 'true'
runs-on: ubuntu-latest
# 18 smoke tasks at ~1–2 min each + env setup — 30 min was getting
# cancelled mid-run. 60 min gives headroom even if a task retries.
timeout-minutes: 60
name: Run skill smoke tests
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v4
with:
repository: UiPath/coder_eval
token: ${{ secrets.GH_PAT }}
path: .coder_eval
- uses: actions/setup-python@v5
with:
python-version: '3.13'
- uses: astral-sh/setup-uv@v4
- uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install coder-eval
working-directory: .coder_eval
env:
UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
run: uv pip install --system .
# Install from public npm at @latest. `--@uipath:registry=`
# forces the @uipath scope to public npm regardless of any
# `.npmrc` scope mapping (e.g., to the internal GitHub Packages
# feed, which carries divergent 1.0.0-alpha.* prereleases). See
# smoke-rpa-skills.yml for the full rationale.
- name: Install uip CLI (public npm @latest)
run: |
npm install -g \
--@uipath:registry=https://registry.npmjs.org/ \
@uipath/cli@latest
uip --version
# Pre-auth uip via the CLI's documented env-var bypass so non-RPA
# smoke tasks (orchestrator / integration-service / data-fabric / ...)
# have a logged-in CLI without the agent ever calling `uip login`.
# See smoke-rpa-skills.yml for the chain rationale; this Linux job
# doesn't go through Helm, but the same UIPATH_CLI_* env vars
# satisfy the JS CLI's general auth state too.
- name: Mint UiPath service-account token + enable env-auth
env:
UIPATH_CLIENT_ID: ${{ secrets.UIPATH_CLIENT_ID }}
UIPATH_CLIENT_SECRET: ${{ secrets.UIPATH_CLIENT_SECRET }}
run: |
set -euo pipefail
TOKEN=$(curl -fsS -X POST \
"https://alpha.uipath.com/identity_/connect/token" \
-H "Content-Type: application/x-www-form-urlencoded" \
-d "grant_type=client_credentials" \
-d "client_id=$UIPATH_CLIENT_ID" \
-d "client_secret=$UIPATH_CLIENT_SECRET" \
-d "scope=OR.Default OR.Execution OR.Robots OR.Machines.Read" \
| python -c "import sys,json;print(json.load(sys.stdin)['access_token'])")
echo "::add-mask::$TOKEN"
{
echo "UIPATH_CLI_ENABLE_ENV_AUTH=true"
echo "UIPATH_CLI_AUTH_TOKEN=$TOKEN"
echo "UIPATH_CLI_ORGANIZATION_NAME=${{ secrets.UIPATH_ORG_NAME }}"
echo "UIPATH_CLI_ORGANIZATION_ID=${{ secrets.UIPATH_ORG_ID }}"
echo "UIPATH_CLI_TENANT_NAME=${{ secrets.UIPATH_TENANT_NAME }}"
echo "UIPATH_CLI_TENANT_ID=${{ secrets.UIPATH_TENANT_ID }}"
} >> "$GITHUB_ENV"
UIPATH_CLI_ENABLE_ENV_AUTH=true \
UIPATH_CLI_AUTH_TOKEN="$TOKEN" \
UIPATH_CLI_ORGANIZATION_NAME="${{ secrets.UIPATH_ORG_NAME }}" \
UIPATH_CLI_ORGANIZATION_ID="${{ secrets.UIPATH_ORG_ID }}" \
UIPATH_CLI_TENANT_NAME="${{ secrets.UIPATH_TENANT_NAME }}" \
UIPATH_CLI_TENANT_ID="${{ secrets.UIPATH_TENANT_ID }}" \
uip login status --output json
- name: Run smoke tests
env:
SKILLS_REPO_PATH: ${{ github.workspace }}
# Route the agent through AWS Bedrock. The Anthropic workspace
# hit its API usage limit (resets 2026-05-01), so DirectRoute
# is blocked. Bedrock has a separate quota.
#
# ANTHROPIC_API_KEY stays set for the LLM reviewer (lightweight
# token spend — coder_eval's reviewer only supports Anthropic
# direct or UiPath LLM Gateway, not Bedrock). If the shared
# workspace quota is still drained when the reviewer fires,
# orchestrator._run_final_llm_review swallows the error and
# the task still records its deterministic criteria result.
API_BACKEND: bedrock
AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
AWS_REGION: ${{ secrets.AWS_REGION }}
BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
working-directory: tests
id: smoke
run: |
echo "Running: coder-eval run ${{ needs.detect.outputs.task_globs }} --tags smoke"
coder-eval run ${{ needs.detect.outputs.task_globs }} \
-e experiments/default.yaml --tags smoke -j 1 -v
continue-on-error: true
- name: Summarize results
if: always()
working-directory: tests
run: |
# Find the most recent run directory (tests may not have produced
# one on catastrophic install failures).
run_dir=$(ls -td runs/*/ 2>/dev/null | head -n 1)
if [ -z "$run_dir" ]; then
echo "::warning::No run directory found — nothing to summarize"
exit 0
fi
echo "Run directory: $run_dir"
echo ""
echo "## Task results"
python - <<'PY'
import json
import pathlib
run_dir = sorted(pathlib.Path('runs').glob('*/'))[-1]
rows = []
for p in sorted(run_dir.rglob('task.json')):
d = json.loads(p.read_text(encoding='utf-8'))
lr = d.get('llm_review') or {}
rows.append((
d.get('task_id', '?'),
d.get('final_status', '?'),
d.get('weighted_score'),
lr.get('score'),
))
if not rows:
print(' (no task.json files found)')
width = max((len(r[0]) for r in rows), default=4)
for task_id, status, score, llm_score in rows:
score_s = f"{score:.2f}" if isinstance(score, (int, float)) else ' —'
llm_s = f"{llm_score:.2f}" if isinstance(llm_score, (int, float)) else ' —'
print(f" {task_id:<{width}} status={status:<8} score={score_s} llm={llm_s}")
PY
echo ""
echo "HTML report artifact: eval-report-${{ github.run_id }}"
echo "Download under the workflow run's Artifacts section and open experiment.html."
- name: Upload HTML / JSON eval report
if: always()
uses: actions/upload-artifact@v4
with:
name: eval-report-${{ github.run_id }}
# Use ** so the pattern still matches the replicate-index segment
# that coder_eval adds to per-task run dirs (runs/<ts>/<variant>/<task>/<NN>/).
path: |
tests/runs/*/experiment.html
tests/runs/*/experiment.md
tests/runs/*/experiment.json
tests/runs/*/experiment.log
tests/runs/*/*/variant.html
tests/runs/*/*/variant.md
tests/runs/*/*/variant.json
tests/runs/**/task.html
tests/runs/**/task.json
tests/runs/**/task.log
if-no-files-found: warn
retention-days: 14
# Quality gate: fail the job if any task's LLM reviewer score is
# below 0.7, independent of deterministic criteria. The reviewer
# catches skill-adherence regressions that file-exists /
# file-contains checks miss (e.g. agent called the wrong subcommand
# but still produced the right output shape). 0.7 (not 0.8) gives
# the qualitative verdict room to dock for minor process issues the
# agent already recovered from without false-failing correct runs.
- name: Enforce LLM reviewer score threshold (>= 0.7)
if: always()
working-directory: tests
env:
REVIEWER_THRESHOLD: "0.7"
# Single-quoted heredoc keeps Python literals intact. Threshold
# is read from the env so we don't have to escape dollar signs.
run: |
python - <<'PY'
import json, os, pathlib, sys
threshold = float(os.environ.get('REVIEWER_THRESHOLD', '0.8'))
run_dirs = sorted(pathlib.Path('runs').glob('*/'))
if not run_dirs:
print('::warning::No run directory — skipping reviewer threshold check')
sys.exit(0)
run_dir = run_dirs[-1]
task_jsons = sorted(run_dir.rglob('task.json'))
failures = []
missing = []
for p in task_jsons:
d = json.loads(p.read_text(encoding='utf-8'))
lr = d.get('llm_review') or {}
score = lr.get('score')
tid = d.get('task_id', p.parent.name)
if not isinstance(score, (int, float)):
missing.append(tid)
continue
if score < threshold:
issues = (lr.get('issues') or '').splitlines()[0][:120]
failures.append((tid, score, issues))
for tid in missing:
print(f'::warning::{tid}: no llm_review (reviewer skipped or errored)')
if failures:
for tid, score, issues in failures:
print(f'::error::{tid}: llm_review score {score:.2f} < {threshold} — {issues}')
print(f'::error::{len(failures)} task(s) below LLM reviewer threshold {threshold}')
sys.exit(1)
print(f'All {len(task_jsons)} task(s) passed llm_review >= {threshold}')
PY
- name: Fail if tests failed
# Also catch `cancelled` (the job timeout firing) so a truncated
# run is not silently reported green.
if: steps.smoke.outcome == 'failure' || steps.smoke.outcome == 'cancelled'
run: exit 1