Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 61 additions & 7 deletions .github/workflows/tessl-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,12 @@ jobs:
run: |
PASS=0
FAIL=0
SUMMARY_ROWS=""
SUMMARY_FILE=$(mktemp)
ERRORS_FILE=$(mktemp)
trap 'rm -f "$SUMMARY_FILE" "$ERRORS_FILE"' EXIT

POLL_INTERVAL=30
TIMEOUT=900

for tile_dir in ${{ steps.detect.outputs.dirs }}; do
TILE_NAME=$(basename "$tile_dir")
Expand All @@ -141,14 +146,58 @@ jobs:
if [ "$EXIT_CODE" -ne 0 ]; then
echo "::warning::tessl eval run failed for $TILE_NAME (exit code $EXIT_CODE)"
FAIL=$((FAIL + 1))
SUMMARY_ROWS="$SUMMARY_ROWS| $TILE_NAME | error | ❌ |\n"
else
echo "| $TILE_NAME | error | ❌ |" >> "$SUMMARY_FILE"
echo " ❌ $TILE_NAME: eval run failed (exit code $EXIT_CODE)" >> "$ERRORS_FILE"
continue
fi

# Extract run ID (UUID) from tessl eval run output
RUN_ID=$(echo "$OUTPUT" | grep -oE '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' | head -1)
if [ -z "$RUN_ID" ]; then
echo "::warning::Could not extract run ID for $TILE_NAME"
FAIL=$((FAIL + 1))
echo "| $TILE_NAME | no run ID | ❌ |" >> "$SUMMARY_FILE"
echo " ❌ $TILE_NAME: could not extract run ID from output" >> "$ERRORS_FILE"
continue
fi

echo "Eval run started for $TILE_NAME with run ID: $RUN_ID"

# Poll for completion
ELAPSED=0
EVAL_STATUS="unknown"
while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
sleep "$POLL_INTERVAL"
ELAPSED=$((ELAPSED + POLL_INTERVAL))

STATUS_OUTPUT=$(tessl eval view "$RUN_ID" --json 2>&1) || true
EVAL_STATUS=$(echo "$STATUS_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['data']['attributes']['status'])" 2>/dev/null) || EVAL_STATUS="unknown"

Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`STATUS_OUTPUT=$(tessl eval view "$RUN_ID" --json 2>&1)` merges stderr into stdout, so anything the CLI writes to stderr ends up in the text handed to the JSON parser. When that happens, `json.load` fails, the `|| EVAL_STATUS="unknown"` fallback kicks in, and `EVAL_STATUS` stays `unknown` on every poll until the timeout expires. Capture JSON from stdout only, and handle non-zero exit codes and non-JSON responses explicitly: check the exit status, treat auth errors or an invalid run ID as an immediate failure, and retry only on transient “not found yet” cases.

Suggested change
STATUS_OUTPUT=$(tessl eval view "$RUN_ID" --json 2>&1) || true
EVAL_STATUS=$(echo "$STATUS_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['data']['attributes']['status'])" 2>/dev/null) || EVAL_STATUS="unknown"
STATUS_STDOUT_FILE=$(mktemp)
STATUS_STDERR_FILE=$(mktemp)
VIEW_EXIT_CODE=0
tessl eval view "$RUN_ID" --json >"$STATUS_STDOUT_FILE" 2>"$STATUS_STDERR_FILE" || VIEW_EXIT_CODE=$?
if [ "$VIEW_EXIT_CODE" -eq 0 ]; then
EVAL_STATUS=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['data']['attributes']['status'])" "$STATUS_STDOUT_FILE" 2>/dev/null) || EVAL_STATUS="unknown"
if [ "$EVAL_STATUS" = "unknown" ]; then
echo " [$TILE_NAME] Poll at ${ELAPSED}s: received non-JSON or unexpected JSON from tessl eval view"
fi
else
STATUS_ERROR=$(cat "$STATUS_STDERR_FILE")
echo " [$TILE_NAME] Poll at ${ELAPSED}s: tessl eval view failed (exit code $VIEW_EXIT_CODE): $STATUS_ERROR"
if echo "$STATUS_ERROR" | grep -Eqi 'not[[:space:]-]*found|no such run|run.*not.*found'; then
EVAL_STATUS="unknown"
else
EVAL_STATUS="failed"
fi
fi
rm -f "$STATUS_STDOUT_FILE" "$STATUS_STDERR_FILE"

Copilot uses AI. Check for mistakes.
echo " [$TILE_NAME] Poll at ${ELAPSED}s: status=$EVAL_STATUS"

if [ "$EVAL_STATUS" = "completed" ] || [ "$EVAL_STATUS" = "failed" ]; then
break
fi
done

if [ "$EVAL_STATUS" = "completed" ]; then
PASS=$((PASS + 1))
SUMMARY_ROWS="$SUMMARY_ROWS| $TILE_NAME | passed | ✅ |\n"
# Show detailed results only for successful eval
echo "| $TILE_NAME | passed | ✅ |" >> "$SUMMARY_FILE"
echo "::group::Eval results for $TILE_NAME"
tessl eval view "$RUN_ID" 2>&1 || true
echo "::endgroup::"
elif [ "$EVAL_STATUS" = "failed" ]; then
FAIL=$((FAIL + 1))
echo "| $TILE_NAME | failed | ❌ |" >> "$SUMMARY_FILE"
echo " ❌ $TILE_NAME: eval failed" >> "$ERRORS_FILE"
echo "::group::Eval results for $TILE_NAME"
tessl eval view --last 2>&1 || true
tessl eval view "$RUN_ID" 2>&1 || true
echo "::endgroup::"
else
FAIL=$((FAIL + 1))
echo "| $TILE_NAME | timeout | ❌ |" >> "$SUMMARY_FILE"
echo " ❌ $TILE_NAME: eval timed out after ${TIMEOUT}s (last status: $EVAL_STATUS)" >> "$ERRORS_FILE"
echo "::warning::Eval for $TILE_NAME timed out after ${TIMEOUT}s (last status: $EVAL_STATUS)"
fi
done

Expand All @@ -159,7 +208,7 @@ jobs:
echo ""
echo "| Tile | Result | Status |"
echo "|------|--------|--------|"
echo -e "$SUMMARY_ROWS"
cat "$SUMMARY_FILE"
echo "| **Total** | **$PASS/$TOTAL passed** | $([ "$FAIL" -eq 0 ] && echo '✅' || echo '❌') |"
} >> "$GITHUB_STEP_SUMMARY"

Expand All @@ -170,6 +219,11 @@ jobs:
echo " Total: $TOTAL"
echo " Passed: $PASS"
echo " Failed: $FAIL"
if [ -s "$ERRORS_FILE" ]; then
echo ""
echo " Failed evals:"
cat "$ERRORS_FILE"
fi
echo "============================="

if [ "$FAIL" -gt 0 ]; then
Expand Down