# Smoke tests for the uipath-rpa skills.
# Introduced in PR #71 — feat(tests): add smoke tests for uipath-rpa skill
name: Smoke RPA Skill Tests (Windows)

# One in-flight run per branch: a newer push cancels the previous run.
concurrency:
  group: smoke-rpa-skills-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

on:
  pull_request:
    # Only run when RPA skills, their test tasks, or the test/workflow
    # infrastructure itself change.
    paths:
      - 'skills/uipath-rpa/**'
      - 'skills/uipath-rpa-legacy/**'
      - 'tests/tasks/uipath-rpa/**'
      - 'tests/tasks/uipath-rpa-legacy/**'
      - 'tests/experiments/**'
      - '.github/workflows/smoke-rpa-skills.yml'
  workflow_dispatch:
jobs:
  # Map changed paths to the smoke-task globs the `smoke` job should run.
  # Outputs:
  #   task_globs — space-separated glob list (relative to tests/), empty if none
  #   skip       — 'true' when no RPA-related change was detected
  detect:
    runs-on: ubuntu-latest
    name: Detect changed RPA skills
    outputs:
      task_globs: ${{ steps.detect.outputs.task_globs }}
      skip: ${{ steps.detect.outputs.skip }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so `git diff origin/<base>...HEAD` can resolve
          # the PR's merge base.
          fetch-depth: 0
      - name: Detect changed RPA skills and map to test tasks
        id: detect
        run: |
          BASE_REF="${{ github.base_ref }}"
          # workflow_dispatch has no base ref — diff against main.
          if [ -z "$BASE_REF" ]; then BASE_REF="main"; fi
          CHANGED=$(git diff --name-only "origin/${BASE_REF}...HEAD")
          # Infra change → run all RPA + rpa-legacy smoke tasks
          if echo "$CHANGED" | grep -qE '^tests/(experiments|_shared)/|^tests/[^/]+\.(py|yaml|toml)$|^\.github/workflows/smoke-rpa-skills\.yml$'; then
            echo "task_globs=tasks/uipath-rpa/**/*.yaml tasks/uipath-rpa-legacy/**/*.yaml" >> "$GITHUB_OUTPUT"
            echo "Running all RPA smoke tests (test infrastructure changed)"
            exit 0
          fi
          GLOBS=""
          if echo "$CHANGED" | grep -qE '^(skills/uipath-rpa/|tests/tasks/uipath-rpa/)'; then
            # NOTE: a bare `[ -d … ] && GLOBS=…` would abort the whole
            # script under the runner's default `bash -e` shell when the
            # directory is missing — use an explicit `if` instead.
            if [ -d "tests/tasks/uipath-rpa" ]; then
              GLOBS="tasks/uipath-rpa/**/*.yaml"
            fi
          fi
          if echo "$CHANGED" | grep -qE '^(skills/uipath-rpa-legacy/|tests/tasks/uipath-rpa-legacy/)'; then
            if [ -d "tests/tasks/uipath-rpa-legacy" ]; then
              if [ -n "$GLOBS" ]; then
                GLOBS="$GLOBS tasks/uipath-rpa-legacy/**/*.yaml"
              else
                GLOBS="tasks/uipath-rpa-legacy/**/*.yaml"
              fi
            fi
          fi
          if [ -z "$GLOBS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "No RPA skill changes detected — skipping"
          else
            echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
            echo "Will test: $GLOBS"
          fi
smoke:
needs: detect
if: needs.detect.outputs.skip != 'true'
runs-on: windows-latest
# RPA smoke tasks spin up Helm on each `uip rpa` call (30–60s each). A
# typical 3-task smoke run with cold Helm + three tasks runs ~25–40
# min; bump the ceiling so we don't cancel mid-task.
timeout-minutes: 90
name: Run RPA skill smoke tests (Windows)
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v4
with:
repository: UiPath/coder_eval
token: ${{ secrets.GH_PAT }}
path: .coder_eval
- uses: actions/setup-python@v5
with:
python-version: '3.13'
- uses: astral-sh/setup-uv@v4
- uses: actions/setup-node@v4
with:
node-version: '20'
- uses: actions/setup-dotnet@v4
with:
dotnet-version: '8.0.x'
- name: Install coder-eval
working-directory: .coder_eval
env:
UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
run: uv pip install --system .
- name: Configure NuGet feed for Helm packages
shell: bash
run: |
dotnet nuget add source \
"https://uipath.pkgs.visualstudio.com/Public.Feeds/_packaging/UiPath-Internal/nuget/v3/index.json" \
--name UiPath-Internal \
--username az \
--password "${{ secrets.UV_INDEX_UIPATH_PASSWORD }}" \
--store-password-in-clear-text
# Install from public npm at @latest. The `--@uipath:registry=`
# flag forces the @uipath scope to public npm even if some
# `.npmrc` (runner image, user-level, or future setup-node config)
# maps it elsewhere — notably the internal GitHub Packages feed,
# which carries divergent 1.0.0-alpha.* prereleases under the
# same scope. Plain `--registry=` does NOT bypass scope mappings;
# only the scope-specific override does. `npm install -g` lands
# under `<npm-prefix>/@uipath/`, where uipcli's ToolManager
# discovers tools, so `uip rpa …` / `uip rpa-legacy …` resolve
# without triggering auto-install.
- name: Install uip CLI + RPA tools (public npm @latest)
shell: bash
run: |
set -e
npm install -g \
--@uipath:registry=https://registry.npmjs.org/ \
@uipath/cli@latest @uipath/rpa-tool@latest @uipath/rpa-legacy-tool@latest
uip --version
uip tools list --output json
# Pre-auth uip as a real licensed Studio user via Studio's own e2e
# auth helper (vendored from Studio/.ci/helm-e2e/oauth-login.mjs).
# Why this and not client_credentials:
# * Helm's HelmFeatureGate runs TWO checks: sign-in state AND
# license SKU entitlement (HelmLicenseSkuFeatureSourceService).
# A licensed user account carries the Studio SKU and clears
# both gates; an External App / client_credentials principal
# typically doesn't.
# * Studio Desktop scopes (StudioWebBackend, OrchestratorApiUserAccess,
# LLMGateway, ...) are user-only — not requestable via
# client_credentials grant at all.
# * Studio engineers themselves use this exact script to drive
# Helm e2e; we get free upkeep when login UI changes.
# The script drives a headless Puppeteer browser through the
# auth-code+PKCE flow, exchanges the code for tokens, then writes
# the standard ~/.uipath/.auth file the JS CLI reads on disk
# (refresh token included — much longer effective lifetime than
# env-auth's ~1h).
- name: Install Puppeteer (auth helper dep)
shell: bash
run: npm install --no-save puppeteer
- name: Authenticate uip as licensed Studio user
id: auth
shell: bash
env:
AUTHORITY: https://alpha.uipath.com
EMAIL: ${{ secrets.UIPATH_EMAIL }}
PASSWORD: ${{ secrets.UIPATH_PASSWORD }}
TENANT: ${{ secrets.UIPATH_TENANT }}
ORG: ${{ secrets.UIPATH_ORG }}
# Drop screenshots + DOM dumps here so we can upload them as an
# artifact for post-mortem on auth failures.
AUTH_DEBUG_DIR: ${{ github.workspace }}/auth-debug
run: |
set -euo pipefail
mkdir -p "$AUTH_DEBUG_DIR"
node .github/scripts/uipath-oauth-login.mjs
# Smoke check the auth context is recognized. Failure here fails
# the job — we'd rather not run smoke tasks against a broken
# login state and chase mysterious sign-in errors per task.
uip login status --output json
# Always upload the auth-debug screenshots / dumps so we have a
# post-mortem artifact even on success (baseline for comparison).
- name: Upload auth-debug artifacts
if: always() && steps.auth.outcome != 'skipped'
uses: actions/upload-artifact@v4
with:
name: auth-debug-${{ github.run_id }}
path: ${{ github.workspace }}/auth-debug
if-no-files-found: ignore
retention-days: 7
- name: Pre-warm Helm (download NuGet package before tests)
shell: bash
run: |
mkdir -p /tmp/helm-warmup && cd /tmp/helm-warmup
uip rpa list-instances --output json 2>&1 || true
taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
- name: Run RPA smoke tests
env:
SKILLS_REPO_PATH: ${{ github.workspace }}
# Route the agent through AWS Bedrock. The Anthropic workspace
# hit its API usage limit (resets 2026-05-01), so DirectRoute
# is blocked. Bedrock has a separate quota.
#
# ANTHROPIC_API_KEY stays set for the LLM reviewer (lightweight
# token spend — coder_eval's reviewer only supports Anthropic
# direct or UiPath LLM Gateway, not Bedrock). If the shared
# workspace quota is still drained when the reviewer fires,
# orchestrator._run_final_llm_review swallows the error and
# the task still records its deterministic criteria result.
API_BACKEND: bedrock
AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
AWS_REGION: ${{ secrets.AWS_REGION }}
BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
working-directory: tests
id: smoke
shell: bash
run: |
# Run each task separately, killing Helm between tasks so one
# task's stale Studio state doesn't leak into the next. Track
# per-task exit codes so one failure doesn't abort the loop
# but still propagates to the overall step outcome.
shopt -s globstar nullglob
overall_exit=0
for task in ${{ needs.detect.outputs.task_globs }}; do
echo "--- Killing leftover Helm/Studio processes ---"
taskkill //F //IM UiPath.Studio.Helm.exe 2>/dev/null || true
echo "--- Running: $task ---"
# Per-task Bedrock model override. The Anthropic Sonnet
# family (4.5 on Bedrock, 4.6 on Anthropic-direct) trips
# the post-generation content filter when emitting the
# full legacy XAML baseline (21 namespaces + 16 assembly
# refs, including PresentationFramework / PresentationCore
# / WindowsBase / System.Xaml / mscorlib +
# Microsoft.VisualBasic.Activities). Opus emits it
# cleanly. coder_eval's BedrockRoute.model overrides
# task-level model, so the only way to swap per-task is
# at the env level here.
task_model="$BEDROCK_MODEL"
case "$task" in
*rpa-legacy*) task_model="eu.anthropic.claude-opus-4-6-v1" ;;
esac
if ! BEDROCK_MODEL="$task_model" coder-eval run "$task" \
-e experiments/default.yaml --tags smoke -j 1 -v; then
overall_exit=1
fi
done
exit $overall_exit
continue-on-error: true
- name: Summarize results
if: always()
working-directory: tests
shell: bash
run: |
run_dir=$(ls -td runs/*/ 2>/dev/null | head -n 1)
if [ -z "$run_dir" ]; then
echo "::warning::No run directory found — nothing to summarize"
exit 0
fi
echo "Run directory: $run_dir"
echo ""
echo "## Task results"
python - <<'PY'
import json, pathlib
run_dirs = sorted(pathlib.Path('runs').glob('*/'))
rows = []
for run_dir in run_dirs:
for p in sorted(run_dir.rglob('task.json')):
d = json.loads(p.read_text(encoding='utf-8'))
lr = d.get('llm_review') or {}
rows.append((
d.get('task_id', '?'),
d.get('final_status', '?'),
d.get('weighted_score'),
lr.get('score'),
))
if not rows:
print(' (no task.json files found)')
width = max((len(r[0]) for r in rows), default=4)
for task_id, status, score, llm_score in rows:
score_s = f"{score:.2f}" if isinstance(score, (int, float)) else ' —'
llm_s = f"{llm_score:.2f}" if isinstance(llm_score, (int, float)) else ' —'
print(f" {task_id:<{width}} status={status:<8} score={score_s} llm={llm_s}")
PY
echo ""
echo "HTML report artifact: eval-report-${{ github.run_id }}"
echo "Download under the workflow run's Artifacts section and open experiment.html."
- name: Upload HTML / JSON eval report
if: always()
uses: actions/upload-artifact@v4
with:
name: eval-report-${{ github.run_id }}
path: |
tests/runs/*/experiment.html
tests/runs/*/experiment.md
tests/runs/*/experiment.json
tests/runs/*/experiment.log
tests/runs/*/*/variant.html
tests/runs/*/*/variant.md
tests/runs/*/*/variant.json
tests/runs/**/task.html
tests/runs/**/task.json
tests/runs/**/task.log
if-no-files-found: warn
retention-days: 14
- name: Enforce LLM reviewer score threshold (>= 0.7)
if: always()
working-directory: tests
shell: bash
env:
# Reviewer is a qualitative gate and tends to dock for minor
# process issues the agent recovers from (e.g. late-added
# project.uiproj, `uip rpa get-errors` exit 1 when the agent
# falls back to `uip rpa build`). 0.7 keeps the gate meaningful
# without false-failing correct-but-imperfect runs.
REVIEWER_THRESHOLD: "0.7"
run: |
python - <<'PY'
import json, os, pathlib, sys
threshold = float(os.environ.get('REVIEWER_THRESHOLD', '0.8'))
run_dirs = sorted(pathlib.Path('runs').glob('*/'))
if not run_dirs:
print('::warning::No run directory — skipping reviewer threshold check')
sys.exit(0)
task_jsons = []
for run_dir in run_dirs:
task_jsons.extend(sorted(run_dir.rglob('task.json')))
failures = []
missing = []
for p in task_jsons:
d = json.loads(p.read_text(encoding='utf-8'))
lr = d.get('llm_review') or {}
score = lr.get('score')
tid = d.get('task_id', p.parent.name)
if not isinstance(score, (int, float)):
missing.append(tid)
continue
if score < threshold:
issues = (lr.get('issues') or '').splitlines()[0][:120]
failures.append((tid, score, issues))
for tid in missing:
print(f'::warning::{tid}: no llm_review (reviewer skipped or errored)')
if failures:
for tid, score, issues in failures:
print(f'::error::{tid}: llm_review score {score:.2f} < {threshold} — {issues}')
print(f'::error::{len(failures)} task(s) below LLM reviewer threshold {threshold}')
sys.exit(1)
print(f'All {len(task_jsons)} task(s) passed llm_review >= {threshold}')
PY
- name: Check test results
if: always()
working-directory: tests
shell: bash
run: |
shopt -s globstar nullglob
mapfile -t TASK_JSONS < <(printf '%s\n' runs/**/task.json)
if [ ${#TASK_JSONS[@]} -eq 0 ]; then
echo "::error::No task results found"
exit 1
fi
total=${#TASK_JSONS[@]}
passed=$(grep -l '"final_status": "SUCCESS"' "${TASK_JSONS[@]}" | wc -l)
echo "Results: $passed/$total tasks passed"
if [ "$passed" -lt "$total" ]; then
echo "::error::$((total - passed)) task(s) failed — download the eval-report-${{ github.run_id }} artifact and open task.html for each failed task"
exit 1
fi
- name: Fail if smoke step failed or was cancelled
if: steps.smoke.outcome == 'failure' || steps.smoke.outcome == 'cancelled'
shell: bash
run: exit 1