feat(tests): add smoke tests for uipath-rpa skill #761
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Smoke Skill Tests

# One active run per branch: a new push cancels the in-flight run for the
# same ref (head_ref on PRs; ref covers workflow_dispatch).
concurrency:
  group: smoke-skills-${{ github.head_ref || github.ref }}
  cancel-in-progress: true

# Trigger on PRs that touch skill definitions or the test harness;
# workflow_dispatch allows manual runs.
on:
  pull_request:
    paths:
      - 'skills/*/SKILL.md'
      - 'skills/*/references/**'
      - 'tests/**'
  workflow_dispatch:
jobs:
  # Maps changed files to coder-eval task globs. Outputs:
  #   task_globs      — space-separated globs (relative to tests/) to run
  #   skip            — 'true' when there is nothing to run
  #   untested_skills — comma-separated changed skills that have no tests
  detect:
    runs-on: ubuntu-latest
    name: Detect changed skills
    outputs:
      task_globs: ${{ steps.detect.outputs.task_globs }}
      skip: ${{ steps.detect.outputs.skip }}
      untested_skills: ${{ steps.detect.outputs.untested_skills }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so the origin/<base>...HEAD three-dot diff works.
          fetch-depth: 0
      - name: Detect changed skills and map to test tasks
        id: detect
        run: |
          # On workflow_dispatch there is no base ref to diff against
          # (github.base_ref is empty, and the previous
          # `git diff origin/...HEAD` invocation failed outright).
          # Treat every tracked file as changed instead — that routes
          # through the "test infrastructure changed" branch below and
          # runs the full non-Windows suite.
          if [ -n "${{ github.base_ref }}" ]; then
            CHANGED=$(git diff --name-only "origin/${{ github.base_ref }}...HEAD")
          else
            CHANGED=$(git ls-files)
          fi
          # RPA + rpa-legacy skills run on the Windows workflow
          # (smoke-rpa-skills.yml). Helm/Studio only exists there, and
          # the full lifecycle (`uip rpa` / `uip rpa-legacy` CLI) is
          # what's being exercised — so we skip those tasks here.
          WINDOWS_SKILLS="uipath-rpa uipath-rpa-legacy"
          # If test infrastructure changed, run all smoke tests except
          # the Windows-only ones.
          if echo "$CHANGED" | grep -qE '^tests/(experiments|_shared)/|^tests/[^/]+\.(py|yaml|toml)$'; then
            shopt -s globstar nullglob
            GLOBS=""
            for f in tests/tasks/**/*.yaml; do
              case "$f" in
                tests/tasks/uipath-rpa/*|tests/tasks/uipath-rpa-legacy/*) continue ;;
              esac
              GLOBS="$GLOBS ${f#tests/}"
            done
            GLOBS=$(echo "$GLOBS" | xargs)
            if [ -z "$GLOBS" ]; then
              echo "skip=true" >> "$GITHUB_OUTPUT"
              echo "No non-Windows smoke tasks to run."
            else
              echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
              echo "Running all non-Windows smoke tests (test infrastructure changed)"
            fi
            exit 0
          fi
          # Extract unique skill names from changed paths. `sed -n '…p'`
          # both filters and maps in one pass: files sitting directly
          # under skills/ or tests/tasks/ (no skill subdirectory) no
          # longer leak through unchanged as bogus "skill" names the way
          # the old grep-then-unanchored-sed pipeline allowed.
          SKILLS=$(echo "$CHANGED" | sed -n 's|^skills/\([^/]*\)/.*|\1|p' | sort -u)
          # Also include skills whose test tasks changed.
          TEST_SKILLS=$(echo "$CHANGED" | sed -n 's|^tests/tasks/\([^/]*\)/.*|\1|p' | sort -u)
          SKILLS=$(printf '%s\n%s' "$SKILLS" "$TEST_SKILLS" | sort -u | grep -v '^$' || true)
          if [ -z "$SKILLS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "No skill changes detected — skipping smoke tests"
            exit 0
          fi
          # Build task glob pattern for changed skills that have tests.
          # Skip the Windows-only RPA skills — they run on smoke-rpa-skills.yml.
          GLOBS=""
          UNTESTED=""
          for skill in $SKILLS; do
            if echo "$WINDOWS_SKILLS" | grep -qw "$skill"; then
              echo "Skipping $skill (runs on Windows — see smoke-rpa-skills.yml)"
              continue
            fi
            if [ -d "tests/tasks/$skill" ]; then
              if [ -n "$GLOBS" ]; then
                GLOBS="$GLOBS tasks/$skill/**/*.yaml"
              else
                GLOBS="tasks/$skill/**/*.yaml"
              fi
              echo "Will test: $skill"
            else
              if [ -n "$UNTESTED" ]; then
                UNTESTED="$UNTESTED, $skill"
              else
                UNTESTED="$skill"
              fi
              echo "No tests for: $skill (skipping)"
            fi
          done
          if [ -n "$UNTESTED" ]; then
            echo "untested_skills=$UNTESTED" >> "$GITHUB_OUTPUT"
          fi
          if [ -z "$GLOBS" ]; then
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "::warning::Changed skills have no smoke tests: $UNTESTED"
          else
            echo "task_globs=$GLOBS" >> "$GITHUB_OUTPUT"
          fi
| warn-untested: | |
| needs: detect | |
| if: needs.detect.outputs.untested_skills != '' | |
| runs-on: ubuntu-latest | |
| name: Warn about untested skills | |
| permissions: | |
| pull-requests: write | |
| steps: | |
| - uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| const skills = '${{ needs.detect.outputs.untested_skills }}'; | |
| const body = `⚠️ **Smoke test coverage gap** — the following changed skills have no tests under \`tests/tasks/\`:\n\n${skills.split(', ').map(s => '- `' + s + '`').join('\n')}\n\nConsider adding smoke tests before merging.`; | |
| // Avoid duplicate comments on repeated pushes | |
| const { data: comments } = await github.rest.issues.listComments({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: context.issue.number, | |
| }); | |
| const existing = comments.find(c => c.body.includes('Smoke test coverage gap')); | |
| if (existing) { | |
| await github.rest.issues.updateComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| comment_id: existing.id, | |
| body, | |
| }); | |
| } else { | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: context.issue.number, | |
| body, | |
| }); | |
| } | |
  # Runs the selected smoke tasks with the coder-eval harness on Linux.
  e2e:
    needs: detect
    if: needs.detect.outputs.skip != 'true'
    runs-on: ubuntu-latest
    # 18 smoke tasks at ~1–2 min each + env setup — 30 min was getting
    # cancelled mid-run. 60 min gives headroom even if a task retries.
    timeout-minutes: 60
    name: Run skill smoke tests
    steps:
      # This repo (skills + tests) and the coder_eval harness, checked
      # out side by side.
      - uses: actions/checkout@v4
      - uses: actions/checkout@v4
        with:
          repository: UiPath/coder_eval
          # PAT needed: coder_eval is a separate private repository.
          token: ${{ secrets.GH_PAT }}
          path: .coder_eval
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - uses: astral-sh/setup-uv@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
      - name: Install coder-eval
        working-directory: .coder_eval
        env:
          # Extra index for coder_eval's dependencies hosted on the
          # internal Azure Artifacts feed (credentials in the URL).
          UV_EXTRA_INDEX_URL: "https://${{ secrets.UV_INDEX_UIPATH_USERNAME }}:${{ secrets.UV_INDEX_UIPATH_PASSWORD }}@uipath.pkgs.visualstudio.com/_packaging/ml-packages/pypi/simple/"
        # --system: install into the runner's Python, no venv needed in CI.
        run: uv pip install --system .
      # Install from public npm at @latest. `--@uipath:registry=`
      # forces the @uipath scope to public npm regardless of any
      # `.npmrc` scope mapping (e.g., to the internal GitHub Packages
      # feed, which carries divergent 1.0.0-alpha.* prereleases). See
      # smoke-rpa-skills.yml for the full rationale.
      - name: Install uip CLI (public npm @latest)
        run: |
          npm install -g \
            --@uipath:registry=https://registry.npmjs.org/ \
            @uipath/cli@latest
          uip --version
      # Pre-auth uip via the CLI's documented env-var bypass so non-RPA
      # smoke tasks (orchestrator / integration-service / data-fabric / ...)
      # have a logged-in CLI without the agent ever calling `uip login`.
      # See smoke-rpa-skills.yml for the chain rationale; this Linux job
      # doesn't go through Helm, but the same UIPATH_CLI_* env vars
      # satisfy the JS CLI's general auth state too.
      - name: Mint UiPath service-account token + enable env-auth
        env:
          UIPATH_CLIENT_ID: ${{ secrets.UIPATH_CLIENT_ID }}
          UIPATH_CLIENT_SECRET: ${{ secrets.UIPATH_CLIENT_SECRET }}
        run: |
          set -euo pipefail
          # OAuth2 client-credentials grant against UiPath identity.
          # -fsS: fail on HTTP errors, quiet progress, keep error output.
          TOKEN=$(curl -fsS -X POST \
            "https://alpha.uipath.com/identity_/connect/token" \
            -H "Content-Type: application/x-www-form-urlencoded" \
            -d "grant_type=client_credentials" \
            -d "client_id=$UIPATH_CLIENT_ID" \
            -d "client_secret=$UIPATH_CLIENT_SECRET" \
            -d "scope=OR.Default OR.Execution OR.Robots OR.Machines.Read" \
            | python -c "import sys,json;print(json.load(sys.stdin)['access_token'])")
          # Mask before the token can appear in any later log line.
          echo "::add-mask::$TOKEN"
          # Export the env-auth variables for all subsequent steps.
          {
            echo "UIPATH_CLI_ENABLE_ENV_AUTH=true"
            echo "UIPATH_CLI_AUTH_TOKEN=$TOKEN"
            echo "UIPATH_CLI_ORGANIZATION_NAME=${{ secrets.UIPATH_ORG_NAME }}"
            echo "UIPATH_CLI_ORGANIZATION_ID=${{ secrets.UIPATH_ORG_ID }}"
            echo "UIPATH_CLI_TENANT_NAME=${{ secrets.UIPATH_TENANT_NAME }}"
            echo "UIPATH_CLI_TENANT_ID=${{ secrets.UIPATH_TENANT_ID }}"
          } >> "$GITHUB_ENV"
          # Sanity-check the env-auth path now. The inline repetition is
          # required: $GITHUB_ENV only takes effect in *later* steps.
          UIPATH_CLI_ENABLE_ENV_AUTH=true \
          UIPATH_CLI_AUTH_TOKEN="$TOKEN" \
          UIPATH_CLI_ORGANIZATION_NAME="${{ secrets.UIPATH_ORG_NAME }}" \
          UIPATH_CLI_ORGANIZATION_ID="${{ secrets.UIPATH_ORG_ID }}" \
          UIPATH_CLI_TENANT_NAME="${{ secrets.UIPATH_TENANT_NAME }}" \
          UIPATH_CLI_TENANT_ID="${{ secrets.UIPATH_TENANT_ID }}" \
          uip login status --output json
      - name: Run smoke tests
        env:
          SKILLS_REPO_PATH: ${{ github.workspace }}
          # Route the agent through AWS Bedrock. The Anthropic workspace
          # hit its API usage limit (resets 2026-05-01), so DirectRoute
          # is blocked. Bedrock has a separate quota.
          #
          # ANTHROPIC_API_KEY stays set for the LLM reviewer (lightweight
          # token spend — coder_eval's reviewer only supports Anthropic
          # direct or UiPath LLM Gateway, not Bedrock). If the shared
          # workspace quota is still drained when the reviewer fires,
          # orchestrator._run_final_llm_review swallows the error and
          # the task still records its deterministic criteria result.
          API_BACKEND: bedrock
          AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.AWS_BEARER_TOKEN_BEDROCK }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          BEDROCK_MODEL: ${{ secrets.BEDROCK_MODEL }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        working-directory: tests
        id: smoke
        # task_globs is deliberately left unquoted so the shell
        # word-splits it into one argument per glob. continue-on-error
        # lets the summarize / artifact / gate steps below run; the final
        # "Fail if tests failed" step re-raises this step's outcome.
        run: |
          echo "Running: coder-eval run ${{ needs.detect.outputs.task_globs }} --tags smoke"
          coder-eval run ${{ needs.detect.outputs.task_globs }} \
            -e experiments/default.yaml --tags smoke -j 1 -v
        continue-on-error: true
| - name: Summarize results | |
| if: always() | |
| working-directory: tests | |
| run: | | |
| # Find the most recent run directory (tests may not have produced | |
| # one on catastrophic install failures). | |
| run_dir=$(ls -td runs/*/ 2>/dev/null | head -n 1) | |
| if [ -z "$run_dir" ]; then | |
| echo "::warning::No run directory found — nothing to summarize" | |
| exit 0 | |
| fi | |
| echo "Run directory: $run_dir" | |
| echo "" | |
| echo "## Task results" | |
| python - <<'PY' | |
| import json | |
| import pathlib | |
| run_dir = sorted(pathlib.Path('runs').glob('*/'))[-1] | |
| rows = [] | |
| for p in sorted(run_dir.rglob('task.json')): | |
| d = json.loads(p.read_text(encoding='utf-8')) | |
| lr = d.get('llm_review') or {} | |
| rows.append(( | |
| d.get('task_id', '?'), | |
| d.get('final_status', '?'), | |
| d.get('weighted_score'), | |
| lr.get('score'), | |
| )) | |
| if not rows: | |
| print(' (no task.json files found)') | |
| width = max((len(r[0]) for r in rows), default=4) | |
| for task_id, status, score, llm_score in rows: | |
| score_s = f"{score:.2f}" if isinstance(score, (int, float)) else ' —' | |
| llm_s = f"{llm_score:.2f}" if isinstance(llm_score, (int, float)) else ' —' | |
| print(f" {task_id:<{width}} status={status:<8} score={score_s} llm={llm_s}") | |
| PY | |
| echo "" | |
| echo "HTML report artifact: eval-report-${{ github.run_id }}" | |
| echo "Download under the workflow run's Artifacts section and open experiment.html." | |
      # Always upload whatever report files exist, even on failure —
      # failed runs are exactly the ones worth inspecting.
      - name: Upload HTML / JSON eval report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-report-${{ github.run_id }}
          # Use ** so the pattern still matches the replicate-index segment
          # that coder_eval adds to per-task run dirs (runs/<ts>/<variant>/<task>/<NN>/).
          path: |
            tests/runs/*/experiment.html
            tests/runs/*/experiment.md
            tests/runs/*/experiment.json
            tests/runs/*/experiment.log
            tests/runs/*/*/variant.html
            tests/runs/*/*/variant.md
            tests/runs/*/*/variant.json
            tests/runs/**/task.html
            tests/runs/**/task.json
            tests/runs/**/task.log
          # warn (not error): a catastrophic install failure can leave no
          # report files at all; earlier steps already surface that case.
          if-no-files-found: warn
          retention-days: 14
| # Quality gate: fail the job if any task's LLM reviewer score is | |
| # below 0.7, independent of deterministic criteria. The reviewer | |
| # catches skill-adherence regressions that file-exists / | |
| # file-contains checks miss (e.g. agent called the wrong subcommand | |
| # but still produced the right output shape). 0.7 (not 0.8) gives | |
| # the qualitative verdict room to dock for minor process issues the | |
| # agent already recovered from without false-failing correct runs. | |
| - name: Enforce LLM reviewer score threshold (>= 0.7) | |
| if: always() | |
| working-directory: tests | |
| env: | |
| REVIEWER_THRESHOLD: "0.7" | |
| # Single-quoted heredoc keeps Python literals intact. Threshold | |
| # is read from the env so we don't have to escape dollar signs. | |
| run: | | |
| python - <<'PY' | |
| import json, os, pathlib, sys | |
| threshold = float(os.environ.get('REVIEWER_THRESHOLD', '0.8')) | |
| run_dirs = sorted(pathlib.Path('runs').glob('*/')) | |
| if not run_dirs: | |
| print('::warning::No run directory — skipping reviewer threshold check') | |
| sys.exit(0) | |
| run_dir = run_dirs[-1] | |
| task_jsons = sorted(run_dir.rglob('task.json')) | |
| failures = [] | |
| missing = [] | |
| for p in task_jsons: | |
| d = json.loads(p.read_text(encoding='utf-8')) | |
| lr = d.get('llm_review') or {} | |
| score = lr.get('score') | |
| tid = d.get('task_id', p.parent.name) | |
| if not isinstance(score, (int, float)): | |
| missing.append(tid) | |
| continue | |
| if score < threshold: | |
| issues = (lr.get('issues') or '').splitlines()[0][:120] | |
| failures.append((tid, score, issues)) | |
| for tid in missing: | |
| print(f'::warning::{tid}: no llm_review (reviewer skipped or errored)') | |
| if failures: | |
| for tid, score, issues in failures: | |
| print(f'::error::{tid}: llm_review score {score:.2f} < {threshold} — {issues}') | |
| print(f'::error::{len(failures)} task(s) below LLM reviewer threshold {threshold}') | |
| sys.exit(1) | |
| print(f'All {len(task_jsons)} task(s) passed llm_review >= {threshold}') | |
| PY | |
      # Re-raise the smoke step's outcome (it ran with continue-on-error
      # so the report/gate steps above could execute first).
      - name: Fail if tests failed
        # Also catch `cancelled` (the job timeout firing) so a truncated
        # run is not silently reported green.
        if: steps.smoke.outcome == 'failure' || steps.smoke.outcome == 'cancelled'
        run: exit 1