Skip to content

Commit 24bd9bb

Browse files
jangel97 and claude committed
feat: add post-task evaluation to determine CI pass/fail
After the main Claude session exits, a second lightweight Claude call analyzes the session log to determine whether the task completed successfully. It writes SUCCESS or FAILURE to a result file and sets the job exit code accordingly. Opt-in via CLAUDIO_RESULT_FILE.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 80c22b2 commit 24bd9bb

1 file changed

Lines changed: 90 additions & 2 deletions

File tree

entrypoint.sh

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,25 @@ if [ "$DEBUG" = "true" ]; then
99
fi
1010

1111
# --- Configuration defaults (each can be overridden from the environment) ---

# Credentials file: GOOGLE_APPLICATION_CREDENTIALS when set and non-empty,
# otherwise the gcloud application-default credentials under ${HOME}.
if [ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]; then
  ADC_PATH="${GOOGLE_APPLICATION_CREDENTIALS}"
else
  ADC_PATH="${HOME}/.config/gcloud/application_default_credentials.json"
fi

# Path the evaluation verdict is written to; empty means evaluation is off.
: "${CLAUDIO_RESULT_FILE:=}"

# Prompt given to the evaluation call. The built-in text is used only when
# the caller did not supply a non-empty CLAUDIO_EVALUATION_PROMPT.
if [ -z "${CLAUDIO_EVALUATION_PROMPT:-}" ]; then
  CLAUDIO_EVALUATION_PROMPT="$(
    cat <<'EOF'
Read this Claude Code session log.

Determine whether the original task was FULLY completed successfully.

Return ONLY one of:
- SUCCESS
- FAILURE: <short reason>

Mark the task as FAILURE if:
- the agent abandoned the task
- commands or tool calls failed without recovery
- tests failed
- the requested work was only partially completed
- the final state is uncertain
- the task could not be verified as complete
EOF
  )"
fi
1231

1332
###################
1433
#### Functions ####
@@ -25,6 +44,46 @@ check_adc() {
2544
return 1
2645
}
2746

47+
#######################################
# Map the evaluation verdict stored in CLAUDIO_RESULT_FILE to an exit status.
#
# The verdict is the first non-blank line of the file, with carriage returns
# removed and surrounding whitespace trimmed. This keeps harmless formatting
# noise in the evaluator's output (a leading empty line, a trailing space,
# CRLF line endings) from turning a SUCCESS into an "invalid format" failure.
# Anything other than exactly "SUCCESS" or a line starting with "FAILURE:"
# is still rejected.
#
# Globals:   CLAUDIO_RESULT_FILE (read)
# Arguments: none
# Outputs:   status and error messages on stdout
# Returns:   0 when the verdict is SUCCESS; 1 when it is FAILURE:*, when the
#            format is not recognised, or when the result file is missing
#######################################
validate_result() {
  echo "=== Validating Claudio result ==="

  if [ ! -f "${CLAUDIO_RESULT_FILE:-}" ]; then
    echo "ERROR: Claudio did not produce a result file"
    echo "ERROR: Task status is unknown"
    return 1
  fi

  local result=""
  local line=""
  # "|| [ -n ... ]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [ -n "${line}" ]; do
    line="${line//$'\r'/}"
    # Trim leading, then trailing, whitespace using parameter expansion only.
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    if [ -n "${line}" ]; then
      result="${line}"
      break
    fi
  done < "${CLAUDIO_RESULT_FILE}"

  echo "Result: ${result}"

  case "${result}" in
    SUCCESS)
      echo "=== Claudio task completed successfully ==="
      return 0
      ;;

    FAILURE:*)
      echo "=== Claudio task reported failure ==="
      echo "${result}"
      return 1
      ;;

    *)
      echo "ERROR: Invalid result format"
      echo "Expected:"
      echo "  SUCCESS"
      echo "or:"
      echo "  FAILURE: <reason>"
      echo
      echo "Received:"
      echo "  ${result}"
      return 1
      ;;
  esac
}
86+
2887
##############
2988
#### Main ####
3089
##############
@@ -68,10 +127,19 @@ done
68127

69128
# --- Non-streaming mode: transparent passthrough ---
70129
# Without CLAUDIO_STREAM=1 this script hands over to claude directly.
# A requested result file is rejected here, before the exec, with an error
# telling the caller that streaming mode is required.
if [ "${CLAUDIO_STREAM:-}" != "1" ]; then
  [ -z "${CLAUDIO_RESULT_FILE}" ] || {
    echo "ERROR: CLAUDIO_RESULT_FILE requires streaming mode (CLAUDIO_STREAM=1) to evaluate results"
    exit 1
  }
  exec claude "$@"
fi
73136

74137
# --- CI streaming mode ---
138+
# Result evaluation reads the session log, so when a result file is requested
# without an explicit CLAUDIO_LOG_FILE a private temp file is created for it.
# The template ends with the X run: a suffix after the X's is a GNU mktemp
# extension that BusyBox and BSD/macOS mktemp do not support.
if [ -n "${CLAUDIO_RESULT_FILE}" ] && [ -z "${CLAUDIO_LOG_FILE:-}" ]; then
  # Stop here on failure; continuing with an empty CLAUDIO_LOG_FILE would
  # leave no session log for the result evaluation to read.
  if ! CLAUDIO_LOG_FILE="$(mktemp /tmp/claudio-session.XXXXXX)"; then
    echo "ERROR: Could not create a temporary session log file for result evaluation"
    exit 1
  fi
  echo "CLAUDIO_LOG_FILE not set; defaulting to ${CLAUDIO_LOG_FILE} for result evaluation"
fi
142+
75143
stream_args=()
76144
[ -n "${CLAUDIO_LOG_FILE:-}" ] && stream_args+=(--log-file "$CLAUDIO_LOG_FILE")
77145
[ -n "${CLAUDIO_WRAP:-}" ] && stream_args+=(--wrap "$CLAUDIO_WRAP")
@@ -110,5 +178,25 @@ wait "$claude_pid" 2>/dev/null && claude_rc=0 || claude_rc=$?
110178

111179
# 143 = SIGTERM (expected when we kill claude after stream ends)
112180
if [ "$stream_rc" -ne 0 ]; then exit "$stream_rc"; fi
113-
if [ "$claude_rc" -eq 0 ] || [ "$claude_rc" -eq 143 ]; then exit 0; fi
114-
exit "$claude_rc"
181+
if [ "$claude_rc" -ne 0 ] && [ "$claude_rc" -ne 143 ]; then exit "$claude_rc"; fi
182+
183+
# Result check: when CLAUDIO_RESULT_FILE is set, a second Claude call reads
# the tail of the session log and writes its verdict to that file;
# validate_result then turns the verdict into this script's exit status.
if [ -n "${CLAUDIO_RESULT_FILE}" ]; then
  # Fail closed: an evaluation that was requested but cannot run (missing or
  # empty session log) must not be reported as a passing job.
  if [ ! -s "${CLAUDIO_LOG_FILE:-}" ]; then
    echo "ERROR: Session log is missing or empty; cannot evaluate task result"
    exit 1
  fi

  echo "=== Evaluating task result ==="

  # Only the last CLAUDIO_RESULT_MAX_CHARS bytes of the log (default 50000)
  # are piped to the evaluator.
  if ! tail -c "${CLAUDIO_RESULT_MAX_CHARS:-50000}" "${CLAUDIO_LOG_FILE}" \
    | claude -p "${CLAUDIO_EVALUATION_PROMPT}" \
      --model "${CLAUDIO_EVALUATION_MODEL:-claude-haiku-4-5-20251001}" \
      --no-session-persistence \
      > "${CLAUDIO_RESULT_FILE}"
  then
    echo "ERROR: Failed to evaluate task result"
    exit 1
  fi

  validate_result
  exit $?
fi

# No evaluation requested: the session itself already ended cleanly.
exit 0

0 commit comments

Comments
 (0)