Improve analyze-and-plan skill quality (65% → 94%) #22

Workflow file for this run

.github/workflows/tessl-eval.yml at 2a7e37d

	name: Tessl Skill Eval

	# Trigger: pull requests whose base branch is main.
	on:
	  pull_request:
	    branches: [main]

	jobs:
	  # Single job; its steps decide at runtime whether an eval was requested
	  # and which tiles need evaluating.
	  tessl-eval:
	    runs-on: ubuntu-latest
	    # Uses the "eval" environment for protection rules (e.g. reviewer approval)
	    # to guard the TESSL_TOKEN secret against exfiltration via same-repo branches.
	    environment: eval
	    steps:
	- uses: actions/checkout@v6
	  with:
	    # Check out the pull request's head commit, so `git log -1` in the
	    # trigger step reads the last commit pushed to the PR branch.
	    ref: ${{ github.event.pull_request.head.sha }}
	    # 0 = fetch full history; the detect step diffs HEAD against the
	    # base branch and needs more than a single-commit clone for that.
	    fetch-depth: 0

	- name: Check eval trigger
	  id: trigger
	  run: |
	    # Evals are opt-in: they run only when the subject line of the checked-out
	    # commit starts with "eval:". The decision is published as the step output
	    # `skip` ("true" / "false"), which the later steps test in their `if:`.
	    LAST_MSG=$(git log -1 --format=%s)
	    echo "commit_message=$LAST_MSG"
	    if [[ ! "$LAST_MSG" =~ ^eval: ]]; then
	      echo "skip=true" >> "$GITHUB_OUTPUT"
	      echo "Last commit does not start with 'eval:' — skipping eval."
	    else
	      echo "skip=false" >> "$GITHUB_OUTPUT"
	      echo "Eval triggered by commit: $LAST_MSG"
	    fi

	- name: Post skip summary (no eval trigger)
	  if: steps.trigger.outputs.skip == 'true'
	  run: |
	    # Explain in the job summary why nothing ran and how to request an eval.
	    # Backticks are escaped so the shell emits them literally as markdown.
	    {
	      echo "## Skill Eval"
	      echo ""
	      echo "Eval not requested — last commit message does not start with \`eval:\`."
	      echo "To trigger evals, create a commit with a message like:"
	      echo "\`\`\`"
	      echo 'git commit --allow-empty -m "eval: test building-blocks changes"'
	      echo "\`\`\`"
	    } >> "$GITHUB_STEP_SUMMARY"

	- name: Detect changed skills with tiles
	  if: steps.trigger.outputs.skip != 'true'
	  id: detect
	  env:
	    # Hand the base branch name to the script through the environment
	    # instead of expanding the expression inside the script text.
	    BASE_REF: ${{ github.base_ref }}
	  run: |
	    # Works out which tile directories (directories containing a tile.json
	    # under skills/) are affected by this pull request.
	    #
	    # Step outputs:
	    #   skip - "true" when there is nothing to evaluate, otherwise "false"
	    #   dirs - space-separated tile directories (only set when skip=false)
	    #
	    # NOTE: paths are handled as whitespace-separated words, so file or
	    # directory names containing whitespace are not supported.
	    CHANGED_FILES=$(git diff --name-only "origin/$BASE_REF...HEAD")

	    # Build list of all tile directories (parents of tile.json)
	    ALL_TILE_DIRS=$(find skills -name tile.json -exec dirname {} \; 2>/dev/null | sort)

	    if [ -z "$ALL_TILE_DIRS" ]; then
	      echo "skip=true" >> "$GITHUB_OUTPUT"
	      echo "No tile.json files found in repo."
	    else
	      # Single pass over the changed files:
	      #   - a file inside a tile directory selects that tile (once);
	      #   - a file under skills/ that belongs to no tile marks the change
	      #     set as "unmatched", which widens the eval to every tile.
	      DIRS=""
	      UNMATCHED=false
	      for changed_file in $CHANGED_FILES; do
	        MATCHED=false
	        for tile_dir in $ALL_TILE_DIRS; do
	          case "$changed_file" in
	            "$tile_dir"/*)
	              MATCHED=true
	              # Add the tile only if it is not already listed. The wildcards
	              # are required: without them the pattern must equal the whole
	              # padded list and never matches, so tiles were added repeatedly.
	              case " $DIRS " in
	                *" $tile_dir "*) ;;
	                *) DIRS="$DIRS $tile_dir" ;;
	              esac
	              break
	              ;;
	          esac
	        done

	        if [ "$MATCHED" = "false" ]; then
	          case "$changed_file" in
	            skills/*) UNMATCHED=true ;;
	          esac
	        fi
	      done

	      DIRS=$(echo "$DIRS" | xargs) # trim whitespace

	      if [ "$UNMATCHED" = "true" ]; then
	        echo "Unmatched changes under skills/ detected — evaluating all tiles"
	        DIRS=$(echo "$ALL_TILE_DIRS" | xargs)
	      fi

	      if [ -z "$DIRS" ]; then
	        echo "skip=true" >> "$GITHUB_OUTPUT"
	        echo "Changed files don't belong to any tile — nothing to eval."
	      else
	        echo "skip=false" >> "$GITHUB_OUTPUT"
	        echo "dirs=$DIRS" >> "$GITHUB_OUTPUT"
	        echo "Changed tiles: $DIRS"
	      fi
	    fi

	- name: Post skip summary (no qualifying tiles)
	  if: steps.trigger.outputs.skip != 'true' && steps.detect.outputs.skip == 'true'
	  run: |
	    # An eval was requested but the detect step found nothing to evaluate;
	    # record that in the job summary.
	    {
	      echo "## Skill Eval"
	      echo ""
	      echo "No qualifying skills to evaluate — either no skill files were changed or changed skills have no \`tile.json\`."
	    } >> "$GITHUB_STEP_SUMMARY"

	- if: steps.trigger.outputs.skip != 'true' && steps.detect.outputs.skip != 'true'
	  # Only reached when an eval was requested and at least one tile qualifies.
	  # NOTE(review): presumably this action installs/authenticates the `tessl`
	  # CLI that the following step invokes — confirm against the action's README.
	  uses: tesslio/setup-tessl@v2
	  with:
	    token: ${{ secrets.TESSL_TOKEN }}

	- name: Run evals
	  if: steps.trigger.outputs.skip != 'true' && steps.detect.outputs.skip != 'true'
	  env:
	    # The tile paths originate from directory names in the checked-out pull
	    # request. Pass them to the script as data through the environment;
	    # expanding the expression inside the script text would let shell
	    # metacharacters in a directory name run as commands.
	    TILE_DIRS: ${{ steps.detect.outputs.dirs }}
	  run: |
	    # For every selected tile: start an eval run, extract its run ID, poll
	    # `tessl eval view` until the run is completed/failed or TIMEOUT elapses,
	    # and record one summary-table row. The step exits 1 if any tile did not
	    # reach the "completed" status.
	    PASS=0
	    FAIL=0
	    SUMMARY_FILE=$(mktemp)
	    ERRORS_FILE=$(mktemp)
	    STDOUT_TMP=$(mktemp)
	    STDERR_TMP=$(mktemp)
	    trap 'rm -f "$SUMMARY_FILE" "$ERRORS_FILE" "$STDOUT_TMP" "$STDERR_TMP"' EXIT

	    # Both values are in seconds; TIMEOUT applies to each tile separately.
	    POLL_INTERVAL=30
	    TIMEOUT=900

	    # Intentionally unquoted: TILE_DIRS is a space-separated list.
	    for tile_dir in $TILE_DIRS; do
	      TILE_NAME=$(basename "$tile_dir")
	      echo "::group::Evaluating $TILE_NAME ($tile_dir)"

	      EXIT_CODE=0
	      OUTPUT=$(tessl eval run "$tile_dir" --workspace adobe 2>&1) || EXIT_CODE=$?
	      echo "$OUTPUT"
	      echo "::endgroup::"

	      if [ "$EXIT_CODE" -ne 0 ]; then
	        echo "::warning::tessl eval run failed for $TILE_NAME (exit code $EXIT_CODE)"
	        FAIL=$((FAIL + 1))
	        echo "| $TILE_NAME | error | ❌ |" >> "$SUMMARY_FILE"
	        echo " ❌ $TILE_NAME: eval run failed (exit code $EXIT_CODE)" >> "$ERRORS_FILE"
	        continue
	      fi

	      # Extract run ID (UUID) from tessl eval run output
	      RUN_ID=$(echo "$OUTPUT" | grep -oE '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' | head -1)
	      if [ -z "$RUN_ID" ]; then
	        echo "::warning::Could not extract run ID for $TILE_NAME"
	        FAIL=$((FAIL + 1))
	        echo "| $TILE_NAME | no run ID | ❌ |" >> "$SUMMARY_FILE"
	        echo " ❌ $TILE_NAME: could not extract run ID from output" >> "$ERRORS_FILE"
	        continue
	      fi

	      echo "Eval run started for $TILE_NAME with run ID: $RUN_ID"

	      # Poll for completion
	      ELAPSED=0
	      EVAL_STATUS="unknown"
	      while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
	        sleep "$POLL_INTERVAL"
	        ELAPSED=$((ELAPSED + POLL_INTERVAL))

	        # Capture stdout and stderr separately to avoid breaking JSON parsing
	        VIEW_EXIT=0
	        tessl eval view "$RUN_ID" --json >"$STDOUT_TMP" 2>"$STDERR_TMP" || VIEW_EXIT=$?

	        if [ "$VIEW_EXIT" -ne 0 ]; then
	          STDERR_CONTENT=$(cat "$STDERR_TMP")
	          echo " [$TILE_NAME] tessl eval view exited $VIEW_EXIT: $STDERR_CONTENT"
	          if echo "$STDERR_CONTENT" | grep -qi "not found"; then
	            # Transient: run not yet visible, keep polling
	            EVAL_STATUS="unknown"
	          else
	            # Permanent failure
	            echo "::warning::tessl eval view failed for $TILE_NAME (exit $VIEW_EXIT): $STDERR_CONTENT"
	            EVAL_STATUS="failed"
	            break
	          fi
	        else
	          # Status is read from data.attributes.status; an unparsable
	          # payload is treated as "unknown" so polling simply continues.
	          EVAL_STATUS=$(python3 -c "import sys,json; print(json.load(sys.stdin)['data']['attributes']['status'])" <"$STDOUT_TMP" 2>/dev/null) || {
	            echo " [$TILE_NAME] Warning: could not parse JSON from stdout"
	            EVAL_STATUS="unknown"
	          }
	        fi

	        echo " [$TILE_NAME] Poll at ${ELAPSED}s: status=$EVAL_STATUS"

	        if [ "$EVAL_STATUS" = "completed" ] || [ "$EVAL_STATUS" = "failed" ]; then
	          break
	        fi
	      done

	      # Any status other than completed/failed at this point means the
	      # polling loop ran out of time.
	      if [ "$EVAL_STATUS" = "completed" ]; then
	        PASS=$((PASS + 1))
	        echo "| $TILE_NAME | passed | ✅ |" >> "$SUMMARY_FILE"
	        echo "::group::Eval results for $TILE_NAME"
	        tessl eval view "$RUN_ID" 2>&1 || true
	        echo "::endgroup::"
	      elif [ "$EVAL_STATUS" = "failed" ]; then
	        FAIL=$((FAIL + 1))
	        echo "| $TILE_NAME | failed | ❌ |" >> "$SUMMARY_FILE"
	        echo " ❌ $TILE_NAME: eval failed" >> "$ERRORS_FILE"
	        echo "::group::Eval results for $TILE_NAME"
	        tessl eval view "$RUN_ID" 2>&1 || true
	        echo "::endgroup::"
	      else
	        FAIL=$((FAIL + 1))
	        echo "| $TILE_NAME | timeout | ❌ |" >> "$SUMMARY_FILE"
	        echo " ❌ $TILE_NAME: eval timed out after ${TIMEOUT}s (last status: $EVAL_STATUS)" >> "$ERRORS_FILE"
	        echo "::warning::Eval for $TILE_NAME timed out after ${TIMEOUT}s (last status: $EVAL_STATUS)"
	      fi
	    done

	    TOTAL=$((PASS + FAIL))

	    # Markdown table for the job summary page.
	    {
	      echo "## Skill Eval"
	      echo ""
	      echo "| Tile | Result | Status |"
	      echo "|------|--------|--------|"
	      cat "$SUMMARY_FILE"
	      echo "| Total | $PASS/$TOTAL passed | $([ "$FAIL" -eq 0 ] && echo '✅' || echo '❌') |"
	    } >> "$GITHUB_STEP_SUMMARY"

	    # Plain-text recap in the step log.
	    echo ""
	    echo "============================="
	    echo " Skill Eval Summary"
	    echo "============================="
	    echo " Total: $TOTAL"
	    echo " Passed: $PASS"
	    echo " Failed: $FAIL"
	    if [ -s "$ERRORS_FILE" ]; then
	      echo ""
	      echo " Failed evals:"
	      cat "$ERRORS_FILE"
	    fi
	    echo "============================="

	    if [ "$FAIL" -gt 0 ]; then
	      echo "::error::$FAIL tile(s) failed evaluation"
	      exit 1
	    fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Improve analyze-and-plan skill quality (65% → 94%) #22

Workflow file

Improve analyze-and-plan skill quality (65% → 94%) #22

Uh oh!

Workflow file for this run