Skip to content

Commit 509e721

Browse files
authored
add retries and timeouts (#51)
Signed-off-by: Derek Misler <derek.misler@docker.com>
1 parent 73c2715 commit 509e721

File tree

7 files changed

+149
-61
lines changed

7 files changed

+149
-61
lines changed

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ jobs:
5252

5353
- name: Install cagent
5454
env:
55-
CAGENT_VERSION: "v1.23.4"
55+
CAGENT_VERSION: "v1.23.6"
5656
run: |
5757
set -e
5858
curl -fL -o cagent \

.github/workflows/review-pr.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ on:
5353
description: "Version of cagent to use"
5454
required: false
5555
type: string
56-
default: "v1.23.4"
56+
default: "v1.23.6"
5757
add-prompt-files:
5858
description: "Comma-separated list of files to append to the prompt (e.g., 'AGENTS.md,CLAUDE.md')"
5959
required: false

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ See the [full PR Review documentation](review-pr/README.md) for more details.
9898
agent: docker/code-analyzer
9999
prompt: "Analyze this codebase"
100100
anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
101-
cagent-version: v1.23.4
101+
cagent-version: v1.23.6
102102
mcp-gateway: true # Set to true to install mcp-gateway
103103
mcp-gateway-version: v0.22.0
104104
yolo: false # Require manual approval
@@ -142,7 +142,7 @@ See the [full PR Review documentation](review-pr/README.md) for more details.
142142
| --------------------- | ------------------------------------------------------------------------------------ | -------- | ------------------------------- |
143143
| `agent` | Agent identifier (e.g., `docker/code-analyzer`) or path to `.yaml` file | Yes | - |
144144
| `prompt` | Prompt to pass to the agent | No | - |
145-
| `cagent-version` | Version of cagent to use | No | `v1.23.4` |
145+
| `cagent-version` | Version of cagent to use | No | `v1.23.6` |
146146
| `mcp-gateway` | Install mcp-gateway (`true`/`false`) | No | `false` |
147147
| `mcp-gateway-version` | Version of mcp-gateway to use (specifying this will enable mcp-gateway installation) | No | `v0.22.0` |
148148
| `anthropic-api-key` | Anthropic API key for Claude models (at least one API key required) | No* | - |
@@ -161,7 +161,7 @@ See the [full PR Review documentation](review-pr/README.md) for more details.
161161
| `yolo` | Auto-approve all prompts (`true`/`false`) | No | `true` |
162162
| `quiet` | Suppress verbose tool call output (`true`/`false`). Set to `false` for debugging. | No | `true` |
163163
| `max-retries` | Maximum number of retries on failure (0 = no retries) | No | `2` |
164-
| `retry-delay` | Seconds to wait between retries | No | `5` |
164+
| `retry-delay` | Base delay in seconds between retries (doubles each attempt) | No | `5` |
165165
| `extra-args` | Additional arguments to pass to `cagent run` | No | - |
166166
| `add-prompt-files` | Comma-separated list of files to append to the prompt (e.g., `AGENTS.md,CLAUDE.md`) | No | - |
167167

action.yml

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ inputs:
1515
cagent-version:
1616
description: "Version of cagent to use"
1717
required: false
18-
default: "v1.23.4"
18+
default: "v1.23.6"
1919
mcp-gateway:
2020
description: "Install mcp-gateway (true/false)"
2121
required: false
@@ -74,6 +74,14 @@ inputs:
7474
description: "Suppress verbose tool call output (true/false). Set to false for debugging."
7575
required: false
7676
default: "true"
77+
max-retries:
78+
description: "Maximum number of retries on failure (0 = no retries)"
79+
required: false
80+
default: "2"
81+
retry-delay:
82+
description: "Base delay in seconds between retries (doubles each attempt)"
83+
required: false
84+
default: "5"
7785
extra-args:
7886
description: "Additional arguments to pass to cagent run"
7987
required: false
@@ -426,6 +434,8 @@ runs:
426434
MCP_INSTALLED: ${{ steps.setup-binaries.outputs.mcp-installed }}
427435
QUIET: ${{ inputs.quiet }}
428436
ADD_PROMPT_FILES: ${{ inputs.add-prompt-files }}
437+
MAX_RETRIES: ${{ inputs.max-retries }}
438+
RETRY_DELAY: ${{ inputs.retry-delay }}
429439
TELEMETRY_TAGS: "source=github-actions,repo=${{ github.repository }},workflow=${{ github.workflow }},run_id=${{ github.run_id }}"
430440
run: |
431441
set -e
@@ -505,19 +515,56 @@ runs:
505515
# Track execution time
506516
START_TIME=$(date +%s)
507517
508-
# SECURE: Direct execution with quoted arguments (no eval!)
509-
set +e # Don't exit on command failure
510-
if [ "$TIMEOUT" != "0" ]; then
511-
printf '%s\n' "$PROMPT_INPUT" | timeout "$TIMEOUT" "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
512-
EXIT_CODE=$?
518+
# Retry loop with exponential backoff
519+
ATTEMPT=0
520+
CURRENT_DELAY="$RETRY_DELAY"
521+
EXIT_CODE=1
522+
523+
while true; do
524+
ATTEMPT=$((ATTEMPT + 1))
525+
526+
if [ "$ATTEMPT" -gt 1 ]; then
527+
echo "🔄 Retry attempt $((ATTEMPT - 1)) of $MAX_RETRIES (waiting ${CURRENT_DELAY}s)..."
528+
sleep "$CURRENT_DELAY"
529+
CURRENT_DELAY=$((CURRENT_DELAY * 2))
530+
# Reset output file for retry
531+
> "$OUTPUT_FILE"
532+
fi
533+
534+
# SECURE: Direct execution with quoted arguments (no eval!)
535+
set +e # Don't exit on command failure
536+
# Pipeline: printf | [timeout] cagent | tee
537+
# PIPESTATUS: [0]=printf [1]=cagent/timeout [2]=tee
538+
if [ "$TIMEOUT" != "0" ]; then
539+
printf '%s\n' "$PROMPT_INPUT" | timeout "$TIMEOUT" "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
540+
EXIT_CODE=${PIPESTATUS[1]}
541+
if [ $EXIT_CODE -eq 124 ]; then
542+
echo "::error::Agent execution timed out after $TIMEOUT seconds"
543+
fi
544+
else
545+
printf '%s\n' "$PROMPT_INPUT" | "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
546+
EXIT_CODE=${PIPESTATUS[1]}
547+
fi
548+
set -e
549+
550+
# Success — no retry needed
551+
if [ $EXIT_CODE -eq 0 ]; then
552+
break
553+
fi
554+
555+
# Timeout (124) — don't retry, it would just time out again
513556
if [ $EXIT_CODE -eq 124 ]; then
514-
echo "::error::Agent execution timed out after $TIMEOUT seconds"
557+
break
515558
fi
516-
else
517-
printf '%s\n' "$PROMPT_INPUT" | "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
518-
EXIT_CODE=${PIPESTATUS[0]}
519-
fi
520-
set -e
559+
560+
# Max retries exhausted
561+
if [ "$ATTEMPT" -gt "$MAX_RETRIES" ]; then
562+
echo "::warning::Agent failed after $MAX_RETRIES retries (exit code: $EXIT_CODE)"
563+
break
564+
fi
565+
566+
echo "::warning::Agent failed (exit code: $EXIT_CODE), will retry..."
567+
done
521568
522569
END_TIME=$(date +%s)
523570
EXECUTION_TIME=$((END_TIME - START_TIME))

review-pr/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ When using `docker/cagent-action/.github/workflows/review-pr.yml`:
236236
| `comment-id` | Comment ID for reactions (auto-detected) | - |
237237
| `additional-prompt` | Additional review guidelines | - |
238238
| `model` | Model override (e.g., `anthropic/claude-haiku-4-5`) | - |
239-
| `cagent-version` | cagent version | `v1.23.4` |
239+
| `cagent-version` | cagent version | `v1.23.6` |
240240
| `auto-review-org` | Organization for auto-review membership check | `docker` |
241241

242242
### `review-pr` (Composite Action)

review-pr/action.yml

Lines changed: 83 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ inputs:
4848
cagent-version:
4949
description: "Version of cagent to use"
5050
required: false
51-
default: "v1.23.4"
51+
default: "v1.23.6"
5252
model:
5353
description: "Model to use for reviews (e.g., anthropic/claude-sonnet-4-5, openai/gpt-4o)"
5454
required: false
@@ -64,7 +64,7 @@ outputs:
6464
value: ${{ steps.run-review.outputs.exit-code }}
6565
review-posted:
6666
description: "Whether a review was posted"
67-
value: ${{ steps.run-review.outputs.review-posted }}
67+
value: ${{ steps.verify-review.outputs.review-verified }}
6868
review-url:
6969
description: "URL to the posted review"
7070
value: ${{ steps.post-summary.outputs.review-url }}
@@ -325,12 +325,18 @@ runs:
325325
# ========================================
326326
# RUN REVIEW using root cagent-action
327327
# ========================================
328+
- name: Record pre-review timestamp
329+
id: pre-review
330+
shell: bash
331+
run: echo "timestamp=$(date -u -d '5 seconds ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-5S +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_OUTPUT
332+
328333
- name: Run PR Review
329334
id: run-review
330335
uses: docker/cagent-action@latest
331336
with:
332337
agent: ${{ github.action_path }}/agents/pr-review.yaml
333338
prompt: ${{ steps.context.outputs.review_prompt }}
339+
timeout: "1200" # 20 min — prevents GitHub App token expiry (1h limit)
334340
anthropic-api-key: ${{ inputs.anthropic-api-key }}
335341
openai-api-key: ${{ inputs.openai-api-key }}
336342
google-api-key: ${{ inputs.google-api-key }}
@@ -343,45 +349,46 @@ runs:
343349
extra-args: ${{ inputs.model && format('--model={0}', inputs.model) || '' }}
344350
add-prompt-files: ${{ inputs.add-prompt-files }}
345351

346-
- name: Save reviewer memory
347-
if: always()
348-
continue-on-error: true # Don't fail if memory file doesn't exist (first run)
349-
uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
350-
with:
351-
path: ${{ github.workspace }}/.cache/pr-review-memory.db
352-
key: pr-review-memory-${{ github.repository }}-${{ github.run_id }}
353-
354-
- name: Clean up old memory caches
355-
if: always()
352+
- name: Verify review was posted
353+
id: verify-review
356354
shell: bash
357355
env:
358-
GH_TOKEN: ${{ steps.resolve-token.outputs.token }}
356+
GH_TOKEN: ${{ github.token }}
357+
PR_NUMBER: ${{ steps.resolve-context.outputs.pr-number }}
358+
PRE_REVIEW_TS: ${{ steps.pre-review.outputs.timestamp }}
359+
EXIT_CODE: ${{ steps.run-review.outputs.exit-code }}
359360
run: |
360-
# Keep the 5 most recent caches, delete older ones
361-
# This prevents proliferation while handling concurrent runs safely
362-
CACHE_PREFIX="pr-review-memory-${{ github.repository }}-"
363-
364-
echo "🧹 Cleaning up old memory caches (keeping 5 most recent)"
365-
366-
# Get caches older than the 5 most recent (sorted by created_at descending, skip first 5)
367-
OLD_CACHES=$(gh api "repos/${{ github.repository }}/actions/caches" \
368-
--jq "[.actions_caches | map(select(.key | startswith(\"$CACHE_PREFIX\"))) | sort_by(.created_at) | reverse | .[5:] | .[].id] | .[]" \
369-
2>/dev/null || echo "")
370-
371-
if [ -z "$OLD_CACHES" ]; then
372-
echo "✅ No old caches to clean up"
361+
# If the agent failed, no review was posted
362+
if [ "$EXIT_CODE" != "0" ]; then
363+
echo "review-verified=false" >> $GITHUB_OUTPUT
373364
exit 0
374365
fi
375366
376-
# Delete each old cache
377-
DELETED=0
378-
for CACHE_ID in $OLD_CACHES; do
379-
if gh api "repos/${{ github.repository }}/actions/caches/$CACHE_ID" -X DELETE 2>/dev/null; then
380-
((DELETED++)) || true
381-
fi
382-
done
367+
# Check for a bot review submitted at or after the pre-review timestamp (recorded just before the review step, with 5s of slack)
368+
API_ERR=$(mktemp)
369+
REVIEW_COUNT=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
370+
--jq --arg ts "$PRE_REVIEW_TS" \
371+
'[.[] | select(.user.type == "Bot" and .submitted_at >= $ts)] | length' \
372+
2>"$API_ERR" || echo "0")
373+
if [ -s "$API_ERR" ]; then
374+
echo "::warning::Review verification API error: $(cat "$API_ERR")"
375+
fi
376+
rm -f "$API_ERR"
383377
384-
echo "✅ Deleted $DELETED old cache(s)"
378+
if [ "$REVIEW_COUNT" -eq 0 ]; then
379+
echo "::warning::Review agent exited successfully but no review was found on the PR. The GitHub token may have expired during execution."
380+
echo "review-verified=false" >> $GITHUB_OUTPUT
381+
else
382+
echo "review-verified=true" >> $GITHUB_OUTPUT
383+
fi
384+
385+
- name: Save reviewer memory
386+
if: always()
387+
continue-on-error: true # Don't fail if memory file doesn't exist (first run)
388+
uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
389+
with:
390+
path: ${{ github.workspace }}/.cache/pr-review-memory.db
391+
key: pr-review-memory-${{ github.repository }}-${{ github.run_id }}
385392

386393
# ========================================
387394
# POST-REVIEW: Clean summary & reactions
@@ -391,25 +398,43 @@ runs:
391398
if: always()
392399
shell: bash
393400
env:
401+
GH_TOKEN: ${{ github.token }}
394402
PR_NUMBER: ${{ steps.resolve-context.outputs.pr-number }}
395403
REPOSITORY: ${{ github.repository }}
396404
EXIT_CODE: ${{ steps.run-review.outputs.exit-code }}
405+
REVIEW_VERIFIED: ${{ steps.verify-review.outputs.review-verified }}
406+
RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
397407
run: |
398408
REVIEW_URL="https://github.com/$REPOSITORY/pull/$PR_NUMBER"
399409
echo "review-url=$REVIEW_URL" >> $GITHUB_OUTPUT
400410
411+
# Determine status and post fallback comment if review wasn't posted
412+
if [ "$EXIT_CODE" != "0" ]; then
413+
STATUS="❌ **Review failed** (exit code: $EXIT_CODE)"
414+
if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
415+
-f body="❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." \
416+
2>&1; then
417+
echo "::warning::Failed to post fallback comment to PR"
418+
fi
419+
elif [ "$REVIEW_VERIFIED" == "false" ]; then
420+
STATUS="⚠️ **Review not posted** — agent completed but review was not found on the PR (possible token expiry)"
421+
if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
422+
-f body="⚠️ **PR Review Incomplete** — The review agent completed analysis but was unable to post the review due to an authentication timeout. [View logs]($RUN_URL)." \
423+
2>&1; then
424+
echo "::warning::Failed to post fallback comment to PR"
425+
fi
426+
else
427+
STATUS="✅ **Review posted successfully**"
428+
fi
429+
401430
# Override the default summary with a cleaner one for PR reviews
402431
{
403432
echo ""
404433
echo "---"
405434
echo ""
406435
echo "## PR Review Summary"
407436
echo ""
408-
if [ "$EXIT_CODE" == "0" ]; then
409-
echo "✅ **Review posted successfully**"
410-
else
411-
echo "❌ **Review failed** (exit code: $EXIT_CODE)"
412-
fi
437+
echo "$STATUS"
413438
echo ""
414439
echo "📝 [View Pull Request #$PR_NUMBER]($REVIEW_URL)"
415440
} >> $GITHUB_STEP_SUMMARY
@@ -418,9 +443,11 @@ runs:
418443
if: steps.resolve-context.outputs.comment-id != '' && always()
419444
shell: bash
420445
env:
421-
GH_TOKEN: ${{ steps.resolve-token.outputs.token }}
446+
GH_TOKEN: ${{ github.token }} # Use github.token (6h lifetime) — App token may have expired
422447
EXIT_CODE: ${{ steps.run-review.outputs.exit-code }}
448+
REVIEW_VERIFIED: ${{ steps.verify-review.outputs.review-verified }}
423449
PR_NUMBER: ${{ steps.resolve-context.outputs.pr-number }}
450+
PRE_REVIEW_TS: ${{ steps.pre-review.outputs.timestamp }}
424451
run: |
425452
if [ "$EXIT_CODE" != "0" ]; then
426453
# Error: add confused reaction
@@ -429,9 +456,23 @@ runs:
429456
exit 0
430457
fi
431458
432-
# Get the latest review state from the PR (use last? to handle empty arrays safely)
459+
if [ "$REVIEW_VERIFIED" == "false" ]; then
460+
# Agent succeeded but review wasn't posted (likely token expiry)
461+
gh api "repos/${{ github.repository }}/issues/comments/${{ steps.resolve-context.outputs.comment-id }}/reactions" \
462+
-X POST -f content='confused' || true
463+
exit 0
464+
fi
465+
466+
# Get the state of the review posted during THIS run
467+
API_ERR=$(mktemp)
433468
REVIEW_STATE=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
434-
--jq '[.[] | select(.user.type == "Bot")] | last? | .state? // empty' 2>/dev/null || echo "")
469+
--jq --arg ts "$PRE_REVIEW_TS" \
470+
'[.[] | select(.user.type == "Bot" and .submitted_at >= $ts)] | last | .state // empty' \
471+
2>"$API_ERR" || echo "")
472+
if [ -s "$API_ERR" ]; then
473+
echo "::warning::Review state API error: $(cat "$API_ERR")"
474+
fi
475+
rm -f "$API_ERR"
435476
436477
if [ "$REVIEW_STATE" == "APPROVED" ]; then
437478
# Approved: thumbs up

tests/test-job-summary.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ echo "---"
3535
echo "| Agent | \`agents/security-scanner.yaml\` |"
3636
echo "| Exit Code | 0 |"
3737
echo "| Execution Time | 45s |"
38-
echo "| cagent Version | v1.23.4 |"
38+
echo "| cagent Version | v1.23.6 |"
3939
echo "| MCP Gateway | false |"
4040
echo ""
4141
echo "✅ **Status:** Success"

0 commit comments

Comments
 (0)