name: Summarize CI Failures

on:
  # Automatically trigger when Prow job status is reported
  status:
  # Allow manual re-analysis of any failed job
  workflow_dispatch:
    inputs:
      prow_url:
        description: 'Prow job URL to analyze (e.g., https://prow.ci.openshift.org/view/gs/test-platform-results/pr-logs/pull/...)'
        required: true
        type: string
      comment_on_pr:
        description: '(Optional) Override PR number for comment - for testing only'
        required: false
        type: string
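
# pull-requests: write is what lets the job post the summary comment below;
# the other scopes stay read-only.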
permissions:
  contents: read
  pull-requests: write
  statuses: read

jobs:
  summarize-failure:
    # Run for:
    #   1. Manual dispatch (workflow_dispatch), OR
    #   2. Failure statuses reported by Prow nvidia-gpu-operator jobs
    # Note: startsWith is used to match the Prow CI job context format (ci/prow/...).
    # This is more secure than contains() alone because it prevents context spoofing.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'status' &&
       github.event.state == 'failure' &&
       startsWith(github.event.context, 'ci/prow/') &&
       contains(github.event.context, 'nvidia-gpu-operator'))
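    # Illustrative (hypothetical) context values: "ci/prow/e2e-nvidia-gpu-operator"
    # would match, while a spoofed "other-ci/prow/nvidia-gpu-operator" would not.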
    runs-on: ubuntu-latest
    timeout-minutes: 15  # Allow time for model download + slow CPU inference
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
          cache: 'pip'
          cache-dependency-path: .github/scripts/ci_failure_summarizer/requirements.txt

      - name: Install Python dependencies
        run: |
          pip install -r .github/scripts/ci_failure_summarizer/requirements.txt

      # Cache Ollama binary (~100MB) and models (~1GB for llama3.2:1b)
      - name: Cache Ollama binary
        id: cache-ollama-bin
        uses: actions/cache@v4
        with:
          path: /usr/local/bin/ollama
          key: ollama-bin-v0.13.5

      - name: Cache Ollama models
        id: cache-ollama-models
        uses: actions/cache@v4
        with:
          path: ~/.ollama
          key: ollama-models-llama3.2-1b-v1
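      # Note: a changed key is a cache miss, so bumping the version suffix in
      # either key above forces a fresh download/pull on the next run.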

      - name: Install Ollama
        if: steps.cache-ollama-bin.outputs.cache-hit != 'true'
        run: |
          # Install a pinned Ollama version with checksum verification for security.
          # Avoid piping remote scripts to a shell - download and verify instead.
          OLLAMA_VERSION="v0.13.5"
          echo "Downloading Ollama ${OLLAMA_VERSION}..."
          curl -fsSL -o ollama-linux-amd64.tgz \
            "https://github.com/ollama/ollama/releases/download/${OLLAMA_VERSION}/ollama-linux-amd64.tgz"
          curl -fsSL -o sha256sum.txt \
            "https://github.com/ollama/ollama/releases/download/${OLLAMA_VERSION}/sha256sum.txt"
          echo "Verifying checksum..."
          # Extract just the line for our file and verify
          grep "ollama-linux-amd64.tgz" sha256sum.txt | sed 's|\./||' > ollama.sha256
          sha256sum -c ollama.sha256
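          # sha256sum -c expects "<hex digest>  <filename>" lines; the sed above
          # strips the "./" path prefix so the filename matches the local download.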
echo "Extracting and installing Ollama..."
tar -xzf ollama-linux-amd64.tgz
sudo install -m 755 bin/ollama /usr/local/bin/ollama
# Clean up
rm -rf ollama-linux-amd64.tgz sha256sum.txt ollama.sha256 bin/ lib/
# Verify installation
ollama --version

      - name: Verify Ollama installation
        if: steps.cache-ollama-bin.outputs.cache-hit == 'true'
        run: |
          echo "Using cached Ollama binary"
          ollama --version

      - name: Pull Ollama model
        run: |
          # Start Ollama server temporarily to pull model
          ollama serve &
          OLLAMA_PID=$!
          # Wait for server to be ready
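          # (/api/tags is a lightweight Ollama endpoint that lists local models;
          # it only responds once the server is accepting requests.)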
          SERVER_READY=false
          for i in {1..30}; do
            if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
              echo "Ollama server is ready"
              SERVER_READY=true
              break
            fi
            echo "Waiting for Ollama server... ($i/30)"
            sleep 1
          done

          if [ "$SERVER_READY" != "true" ]; then
            echo "::error::Ollama server failed to start"
            exit 1
          fi

          # Pull model (will be fast if cached)
          echo "Pulling llama3.2:1b model..."
          ollama pull llama3.2:1b

          # Verify model is available
          ollama list

          # Stop server (it is restarted in the summarize step)
          kill $OLLAMA_PID 2>/dev/null || true

      - name: Parse job info
        id: parse
        env:
          EVENT_NAME: ${{ github.event_name }}
          INPUT_PROW_URL: ${{ inputs.prow_url }}
          STATUS_TARGET_URL: ${{ github.event.target_url }}
          STATUS_CONTEXT: ${{ github.event.context }}
        run: |
          python3 << 'EOF'
          import re
          import os
          import uuid

          def set_output(name, value):
              """Write output using multiline delimiter to prevent injection."""
              # Sanitize: strip newlines and control chars from value
              safe_value = ''.join(c for c in str(value) if c.isprintable() or c == ' ')
              delimiter = f"ghadelimiter_{uuid.uuid4().hex}"
              with open(os.environ["GITHUB_OUTPUT"], "a") as f:
                  f.write(f"{name}<<{delimiter}\n{safe_value}\n{delimiter}\n")
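
          # Illustrative (hypothetical values): set_output("pr_number", "1234") appends
          #   pr_number<<ghadelimiter_<random hex>
          #   1234
          #   ghadelimiter_<random hex>
          # to $GITHUB_OUTPUT; the random delimiter keeps a crafted value from
          # closing the block early.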

          event_name = os.environ.get("EVENT_NAME", "")

          # Get URL from either manual input or status event
          if event_name == "workflow_dispatch":
              url = os.environ.get("INPUT_PROW_URL", "")
              print(f"Manual trigger with URL: {url}")
          else:
              url = os.environ.get("STATUS_TARGET_URL", "")
              context = os.environ.get("STATUS_CONTEXT", "")
              print(f"Status event - Context: {context}")
              print(f"Status event - URL: {url}")

          # Parse: .../pr-logs/pull/org_repo/PR/job-name/build-id
          match = re.search(r'/pr-logs/pull/([^/]+)/(\d+)/([^/]+)/(\d+)', url)
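          # Illustrative match (hypothetical values): for a URL ending in
          #   .../pr-logs/pull/rh-ecosystem-edge_nvidia-ci/1234/some-job-name/5678
          # match.groups() yields
          #   ("rh-ecosystem-edge_nvidia-ci", "1234", "some-job-name", "5678")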

          # Expected org_repo for this workflow (must match to prevent cross-repo issues)
          EXPECTED_ORG_REPO = "rh-ecosystem-edge_nvidia-ci"

          if match:
              org_repo, pr_number, job_name, build_id = match.groups()
              print(f"Parsed: org_repo={org_repo}, PR={pr_number}, Job={job_name}, Build={build_id}")
              # Validate org_repo matches this repository to prevent commenting on wrong PRs
              if org_repo != EXPECTED_ORG_REPO:
                  print(f"::error::Prow URL is for '{org_repo}', expected '{EXPECTED_ORG_REPO}'")
                  print("This workflow only supports PRs from the nvidia-ci repository")
                  set_output("parsed", "false")
                  raise SystemExit(1)
              set_output("org_repo", org_repo)
              set_output("pr_number", pr_number)
              set_output("job_name", job_name)
              set_output("build_id", build_id)
              set_output("prow_url", url)
              set_output("parsed", "true")
          else:
              print(f"::error::Could not parse Prow URL: {url}")
              set_output("parsed", "false")
              raise SystemExit(1)
          EOF

      - name: Generate AI summary
        id: summarize
        env:
          PROW_URL: ${{ steps.parse.outputs.prow_url }}
        run: |
          # Start Ollama server (model is already cached)
          echo "Starting Ollama server..."
          ollama serve &

          # Wait for server to be ready
          for i in {1..30}; do
            if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
              echo "Ollama server is ready"
              break
            fi
            echo "Waiting for Ollama server... ($i/30)"
            sleep 1
          done

          # Verify server is running
          if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
            echo "::error::Ollama server failed to start"
            exit 1
          fi

          # Debug: list available models
          echo "Available models:"
          ollama list

          # Debug: dump the tags endpoint to confirm the model is listed
          echo "Testing model availability..."
          curl -s http://localhost:11434/api/tags | head -c 500
          echo ""

          # Run the summarizer
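          # (It is expected to write 'summary', 'pr_number', and 'error' to
          # GITHUB_OUTPUT; the steps below consume those outputs.)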
          PYTHONPATH=.github/scripts python -m ci_failure_summarizer.summarize

      - name: Post PR comment
        if: steps.summarize.outputs.summary != ''
        uses: actions/github-script@v7
        with:
          script: |
            const summary = process.env.SUMMARY;
            const prNumber = process.env.PR_NUMBER;

            if (!prNumber) {
              console.log('No PR number available, skipping comment');
              return;
            }

            console.log(`Posting summary to PR #${prNumber}`);
            try {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: parseInt(prNumber, 10),
                body: summary
              });
              console.log('Comment posted successfully');
            } catch (error) {
              console.log(`::warning::Failed to post comment to PR #${prNumber}: ${error.message}`);
              console.log('This may happen if the PR is closed or permissions are insufficient');
            }
        env:
          SUMMARY: ${{ steps.summarize.outputs.summary }}
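          # PR number precedence: manual override input > summarizer output >
          # value parsed from the Prow URL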
          PR_NUMBER: ${{ inputs.comment_on_pr || steps.summarize.outputs.pr_number || steps.parse.outputs.pr_number }}

      - name: Report error
        if: failure()
        env:
          SUMMARIZE_ERROR: ${{ steps.summarize.outputs.error }}
        run: |
          if [ -n "$SUMMARIZE_ERROR" ]; then
            echo "::error::$SUMMARIZE_ERROR"
          else
            echo "::error::Workflow failed - check logs for details"
          fi