add retries and timeouts (#51)

derekmisler · web-flow · commit 509e7216a9fc · 2026-02-23T17:07:42.000-05:00
Signed-off-by: Derek Misler &lt;derek.misler@docker.com&gt;
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -52,7 +52,7 @@ jobs:
 
       - name: Install cagent
         env:
-          CAGENT_VERSION: "v1.23.4"
+          CAGENT_VERSION: "v1.23.6"
         run: |
           set -e
           curl -fL -o cagent \
diff --git a/.github/workflows/review-pr.yml b/.github/workflows/review-pr.yml
@@ -53,7 +53,7 @@ on:
         description: "Version of cagent to use"
         required: false
         type: string
-        default: "v1.23.4"
+        default: "v1.23.6"
       add-prompt-files:
         description: "Comma-separated list of files to append to the prompt (e.g., 'AGENTS.md,CLAUDE.md')"
         required: false
diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ See the [full PR Review documentation](review-pr/README.md) for more details.
     agent: docker/code-analyzer
     prompt: "Analyze this codebase"
     anthropic-api-key: ${{ secrets.ANTHROPIC_API_KEY }}
-    cagent-version: v1.23.4
+    cagent-version: v1.23.6
     mcp-gateway: true # Set to true to install mcp-gateway
     mcp-gateway-version: v0.22.0
     yolo: false # Require manual approval
@@ -142,7 +142,7 @@ See the [full PR Review documentation](review-pr/README.md) for more details.
 | --------------------- | ------------------------------------------------------------------------------------ | -------- | ------------------------------- |
 | `agent`               | Agent identifier (e.g., `docker/code-analyzer`) or path to `.yaml` file              | Yes      | -                               |
 | `prompt`              | Prompt to pass to the agent                                                          | No       | -                               |
-| `cagent-version`      | Version of cagent to use                                                             | No       | `v1.23.4`                       |
+| `cagent-version`      | Version of cagent to use                                                             | No       | `v1.23.6`                       |
 | `mcp-gateway`         | Install mcp-gateway (`true`/`false`)                                                 | No       | `false`                         |
 | `mcp-gateway-version` | Version of mcp-gateway to use (specifying this will enable mcp-gateway installation) | No       | `v0.22.0`                       |
 | `anthropic-api-key`   | Anthropic API key for Claude models (at least one API key required)                  | No*      | -                                   |
@@ -161,7 +161,7 @@ See the [full PR Review documentation](review-pr/README.md) for more details.
 | `yolo`                | Auto-approve all prompts (`true`/`false`)                                            | No       | `true`                          |
 | `quiet`               | Suppress verbose tool call output (`true`/`false`). Set to `false` for debugging.    | No       | `true`                          |
 | `max-retries`         | Maximum number of retries on failure (0 = no retries)                                | No       | `2`                             |
-| `retry-delay`         | Seconds to wait between retries                                                      | No       | `5`                             |
+| `retry-delay`         | Base delay in seconds between retries (doubles each attempt)                          | No       | `5`                             |
 | `extra-args`          | Additional arguments to pass to `cagent run`                                         | No       | -                               |
 | `add-prompt-files`    | Comma-separated list of files to append to the prompt (e.g., `AGENTS.md,CLAUDE.md`)  | No       | -                               |
 
diff --git a/action.yml b/action.yml
@@ -15,7 +15,7 @@ inputs:
   cagent-version:
     description: "Version of cagent to use"
     required: false
-    default: "v1.23.4"
+    default: "v1.23.6"
   mcp-gateway:
     description: "Install mcp-gateway (true/false)"
     required: false
@@ -74,6 +74,14 @@ inputs:
     description: "Suppress verbose tool call output (true/false). Set to false for debugging."
     required: false
     default: "true"
+  max-retries:
+    description: "Maximum number of retries on failure (0 = no retries)"
+    required: false
+    default: "2"
+  retry-delay:
+    description: "Base delay in seconds between retries (doubles each attempt)"
+    required: false
+    default: "5"
   extra-args:
     description: "Additional arguments to pass to cagent run"
     required: false
@@ -426,6 +434,8 @@ runs:
         MCP_INSTALLED: ${{ steps.setup-binaries.outputs.mcp-installed }}
         QUIET: ${{ inputs.quiet }}
         ADD_PROMPT_FILES: ${{ inputs.add-prompt-files }}
+        MAX_RETRIES: ${{ inputs.max-retries }}
+        RETRY_DELAY: ${{ inputs.retry-delay }}
         TELEMETRY_TAGS: "source=github-actions,repo=${{ github.repository }},workflow=${{ github.workflow }},run_id=${{ github.run_id }}"
       run: |
         set -e
@@ -505,19 +515,56 @@ runs:
         # Track execution time
         START_TIME=$(date +%s)
 
-        # SECURE: Direct execution with quoted arguments (no eval!)
-        set +e  # Don't exit on command failure
-        if [ "$TIMEOUT" != "0" ]; then
-          printf '%s\n' "$PROMPT_INPUT" | timeout "$TIMEOUT" "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
-          EXIT_CODE=$?
+        # Retry loop with exponential backoff
+        ATTEMPT=0
+        CURRENT_DELAY="$RETRY_DELAY"
+        EXIT_CODE=1
+
+        while true; do
+          ATTEMPT=$((ATTEMPT + 1))
+
+          if [ "$ATTEMPT" -gt 1 ]; then
+            echo "🔄 Retry attempt $((ATTEMPT - 1)) of $MAX_RETRIES (waiting ${CURRENT_DELAY}s)..."
+            sleep "$CURRENT_DELAY"
+            CURRENT_DELAY=$((CURRENT_DELAY * 2))
+            # Reset output file for retry
+            > "$OUTPUT_FILE"
+          fi
+
+          # SECURE: Direct execution with quoted arguments (no eval!)
+          set +e  # Don't exit on command failure
+          # Pipeline: printf | [timeout] cagent | tee
+          # PIPESTATUS: [0]=printf  [1]=cagent/timeout  [2]=tee
+          if [ "$TIMEOUT" != "0" ]; then
+            printf '%s\n' "$PROMPT_INPUT" | timeout "$TIMEOUT" "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
+            EXIT_CODE=${PIPESTATUS[1]}
+            if [ $EXIT_CODE -eq 124 ]; then
+              echo "::error::Agent execution timed out after $TIMEOUT seconds"
+            fi
+          else
+            printf '%s\n' "$PROMPT_INPUT" | "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
+            EXIT_CODE=${PIPESTATUS[1]}
+          fi
+          set -e
+
+          # Success — no retry needed
+          if [ $EXIT_CODE -eq 0 ]; then
+            break
+          fi
+
+          # Timeout (124) — don't retry, would just timeout again
           if [ $EXIT_CODE -eq 124 ]; then
-            echo "::error::Agent execution timed out after $TIMEOUT seconds"
+            break
           fi
-        else
-          printf '%s\n' "$PROMPT_INPUT" | "$GITHUB_WORKSPACE/cagent" "${ARGS[@]}" 2>&1 | tee "$OUTPUT_FILE"
-          EXIT_CODE=${PIPESTATUS[0]}
-        fi
-        set -e
+
+          # Max retries exhausted (also stops when MAX_RETRIES is empty/non-numeric, which would otherwise loop forever)
+          if ! [ "$ATTEMPT" -le "$MAX_RETRIES" ]; then
+            echo "::warning::Agent failed after $MAX_RETRIES retries (exit code: $EXIT_CODE)"
+            break
+          fi
+
+          echo "::warning::Agent failed (exit code: $EXIT_CODE), will retry..."
+        done
 
         END_TIME=$(date +%s)
         EXECUTION_TIME=$((END_TIME - START_TIME))
diff --git a/review-pr/README.md b/review-pr/README.md
@@ -236,7 +236,7 @@ When using `docker/cagent-action/.github/workflows/review-pr.yml`:
 | `comment-id`        | Comment ID for reactions (auto-detected)            | -         |
 | `additional-prompt` | Additional review guidelines                        | -         |
 | `model`             | Model override (e.g., `anthropic/claude-haiku-4-5`) | -         |
-| `cagent-version`    | cagent version                                      | `v1.23.4` |
+| `cagent-version`    | cagent version                                      | `v1.23.6` |
 | `auto-review-org`   | Organization for auto-review membership check       | `docker`  |
 
 ### `review-pr` (Composite Action)
diff --git a/review-pr/action.yml b/review-pr/action.yml
@@ -48,7 +48,7 @@ inputs:
   cagent-version:
     description: "Version of cagent to use"
     required: false
-    default: "v1.23.4"
+    default: "v1.23.6"
   model:
     description: "Model to use for reviews (e.g., anthropic/claude-sonnet-4-5, openai/gpt-4o)"
     required: false
@@ -64,7 +64,7 @@ outputs:
     value: ${{ steps.run-review.outputs.exit-code }}
   review-posted:
     description: "Whether a review was posted"
-    value: ${{ steps.run-review.outputs.review-posted }}
+    value: ${{ steps.verify-review.outputs.review-verified }}
   review-url:
     description: "URL to the posted review"
     value: ${{ steps.post-summary.outputs.review-url }}
@@ -325,12 +325,18 @@ runs:
     # ========================================
     # RUN REVIEW using root cagent-action
     # ========================================
+    - name: Record pre-review timestamp
+      id: pre-review
+      shell: bash
+      run: echo "timestamp=$(date -u -d '5 seconds ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -v-5S +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_OUTPUT
+
     - name: Run PR Review
       id: run-review
       uses: docker/cagent-action@latest
       with:
         agent: ${{ github.action_path }}/agents/pr-review.yaml
         prompt: ${{ steps.context.outputs.review_prompt }}
+        timeout: "1200"  # 20 min — prevents GitHub App token expiry (1h limit)
         anthropic-api-key: ${{ inputs.anthropic-api-key }}
         openai-api-key: ${{ inputs.openai-api-key }}
         google-api-key: ${{ inputs.google-api-key }}
@@ -343,45 +349,46 @@ runs:
         extra-args: ${{ inputs.model && format('--model={0}', inputs.model) || '' }}
         add-prompt-files: ${{ inputs.add-prompt-files }}
 
-    - name: Save reviewer memory
-      if: always()
-      continue-on-error: true  # Don't fail if memory file doesn't exist (first run)
-      uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
-      with:
-        path: ${{ github.workspace }}/.cache/pr-review-memory.db
-        key: pr-review-memory-${{ github.repository }}-${{ github.run_id }}
-
-    - name: Clean up old memory caches
-      if: always()
+    - name: Verify review was posted
+      id: verify-review
       shell: bash
       env:
-        GH_TOKEN: ${{ steps.resolve-token.outputs.token }}
+        GH_TOKEN: ${{ github.token }}
+        PR_NUMBER: ${{ steps.resolve-context.outputs.pr-number }}
+        PRE_REVIEW_TS: ${{ steps.pre-review.outputs.timestamp }}
+        EXIT_CODE: ${{ steps.run-review.outputs.exit-code }}
       run: |
-        # Keep the 5 most recent caches, delete older ones
-        # This prevents proliferation while handling concurrent runs safely
-        CACHE_PREFIX="pr-review-memory-${{ github.repository }}-"
-
-        echo "🧹 Cleaning up old memory caches (keeping 5 most recent)"
-
-        # Get caches older than the 5 most recent (sorted by created_at descending, skip first 5)
-        OLD_CACHES=$(gh api "repos/${{ github.repository }}/actions/caches" \
-          --jq "[.actions_caches | map(select(.key | startswith(\"$CACHE_PREFIX\"))) | sort_by(.created_at) | reverse | .[5:] | .[].id] | .[]" \
-          2>/dev/null || echo "")
-
-        if [ -z "$OLD_CACHES" ]; then
-          echo "✅ No old caches to clean up"
+        # If the agent failed, no review was posted
+        if [ "$EXIT_CODE" != "0" ]; then
+          echo "review-verified=false" >> $GITHUB_OUTPUT
           exit 0
         fi
 
-        # Delete each old cache
-        DELETED=0
-        for CACHE_ID in $OLD_CACHES; do
-          if gh api "repos/${{ github.repository }}/actions/caches/$CACHE_ID" -X DELETE 2>/dev/null; then
-            ((DELETED++)) || true
-          fi
-        done
+        # Check for a bot review submitted AFTER this run started.
+        # `gh api --jq` takes a single expression and has no `--arg`, so the timestamp is interpolated by the shell.
+        API_ERR=$(mktemp)
+        REVIEW_COUNT=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
+          --jq "[.[] | select(.user.type == \"Bot\" and .submitted_at >= \"$PRE_REVIEW_TS\")] | length" \
+          2>"$API_ERR" || echo "0")
+        if [ -s "$API_ERR" ]; then
+          echo "::warning::Review verification API error: $(cat "$API_ERR")"
+        fi
+        rm -f "$API_ERR"
 
-        echo "✅ Deleted $DELETED old cache(s)"
+        if [ "$REVIEW_COUNT" -eq 0 ]; then
+          echo "::warning::Review agent exited successfully but no review was found on the PR. The GitHub token may have expired during execution."
+          echo "review-verified=false" >> $GITHUB_OUTPUT
+        else
+          echo "review-verified=true" >> $GITHUB_OUTPUT
+        fi
+
+    - name: Save reviewer memory
+      if: always()
+      continue-on-error: true  # Don't fail if memory file doesn't exist (first run)
+      uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
+      with:
+        path: ${{ github.workspace }}/.cache/pr-review-memory.db
+        key: pr-review-memory-${{ github.repository }}-${{ github.run_id }}
 
     # ========================================
     # POST-REVIEW: Clean summary & reactions
@@ -391,25 +398,43 @@ runs:
       if: always()
       shell: bash
       env:
+        GH_TOKEN: ${{ github.token }}
         PR_NUMBER: ${{ steps.resolve-context.outputs.pr-number }}
         REPOSITORY: ${{ github.repository }}
         EXIT_CODE: ${{ steps.run-review.outputs.exit-code }}
+        REVIEW_VERIFIED: ${{ steps.verify-review.outputs.review-verified }}
+        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
       run: |
         REVIEW_URL="https://github.com/$REPOSITORY/pull/$PR_NUMBER"
         echo "review-url=$REVIEW_URL" >> $GITHUB_OUTPUT
 
+        # Determine status and post fallback comment if review wasn't posted (the cause of a missing review is not known here, so the comment hedges it)
+        if [ "$EXIT_CODE" != "0" ]; then
+          STATUS="❌ **Review failed** (exit code: $EXIT_CODE)"
+          if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
+            -f body="❌ **PR Review Failed** — The review agent encountered an error and could not complete the review. [View logs]($RUN_URL)." \
+            2>&1; then
+            echo "::warning::Failed to post fallback comment to PR"
+          fi
+        elif [ "$REVIEW_VERIFIED" == "false" ]; then
+          STATUS="⚠️ **Review not posted** — agent completed but review was not found on the PR (possible token expiry)"
+          if ! gh api "repos/$REPOSITORY/issues/$PR_NUMBER/comments" \
+            -f body="⚠️ **PR Review Incomplete** — The review agent completed analysis but no review was found on the PR (possibly because the authentication token expired). [View logs]($RUN_URL)." \
+            2>&1; then
+            echo "::warning::Failed to post fallback comment to PR"
+          fi
+        else
+          STATUS="✅ **Review posted successfully**"
+        fi
+
         # Override the default summary with a cleaner one for PR reviews
         {
           echo ""
           echo "---"
           echo ""
           echo "## PR Review Summary"
           echo ""
-          if [ "$EXIT_CODE" == "0" ]; then
-            echo "✅ **Review posted successfully**"
-          else
-            echo "❌ **Review failed** (exit code: $EXIT_CODE)"
-          fi
+          echo "$STATUS"
           echo ""
           echo "📝 [View Pull Request #$PR_NUMBER]($REVIEW_URL)"
         } >> $GITHUB_STEP_SUMMARY
@@ -418,9 +443,11 @@ runs:
       if: steps.resolve-context.outputs.comment-id != '' && always()
       shell: bash
       env:
-        GH_TOKEN: ${{ steps.resolve-token.outputs.token }}
+        GH_TOKEN: ${{ github.token }}  # Use github.token (6h lifetime) — App token may have expired
         EXIT_CODE: ${{ steps.run-review.outputs.exit-code }}
+        REVIEW_VERIFIED: ${{ steps.verify-review.outputs.review-verified }}
         PR_NUMBER: ${{ steps.resolve-context.outputs.pr-number }}
+        PRE_REVIEW_TS: ${{ steps.pre-review.outputs.timestamp }}
       run: |
         if [ "$EXIT_CODE" != "0" ]; then
           # Error: add confused reaction
@@ -429,9 +456,23 @@ runs:
           exit 0
         fi
 
-        # Get the latest review state from the PR (use last? to handle empty arrays safely)
+        if [ "$REVIEW_VERIFIED" == "false" ]; then
+          # Agent succeeded but review wasn't posted (likely token expiry)
+          gh api "repos/${{ github.repository }}/issues/comments/${{ steps.resolve-context.outputs.comment-id }}/reactions" \
+            -X POST -f content='confused' || true
+          exit 0
+        fi
+
+        # Get the state of the review posted during THIS run (gh's --jq has no --arg, so the timestamp is interpolated)
+        API_ERR=$(mktemp)
         REVIEW_STATE=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
-          --jq '[.[] | select(.user.type == "Bot")] | last? | .state? // empty' 2>/dev/null || echo "")
+          --jq "[.[] | select(.user.type == \"Bot\" and .submitted_at >= \"$PRE_REVIEW_TS\")]
+            | last | .state // empty" \
+          2>"$API_ERR" || echo "")
+        if [ -s "$API_ERR" ]; then
+          echo "::warning::Review state API error: $(cat "$API_ERR")"
+        fi
+        rm -f "$API_ERR"
 
         if [ "$REVIEW_STATE" == "APPROVED" ]; then
           # Approved: thumbs up
diff --git a/tests/test-job-summary.sh b/tests/test-job-summary.sh
@@ -35,7 +35,7 @@ echo "---"
   echo "| Agent | \`agents/security-scanner.yaml\` |"
   echo "| Exit Code | 0 |"
   echo "| Execution Time | 45s |"
-  echo "| cagent Version | v1.23.4 |"
+  echo "| cagent Version | v1.23.6 |"
   echo "| MCP Gateway | false |"
   echo ""
   echo "✅ **Status:** Success"