Uptime Monitoring #4479
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow-level settings: display name, triggers, and token permissions.
name: Uptime Monitoring

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Scheduled run every 15 minutes.
  schedule:
    - cron: "*/15 * * * *"

# The workflow token may read repository contents and write issues; the
# github-script steps below use the latter to create and comment on alerts.
permissions:
  contents: read
  issues: write

jobs:
# Job: probe https://lumos-lang.org and, when the probe does not return
# HTTP 200, print diagnostics and create (or comment on) an alert issue.
check-website:
  name: Check lumos-lang.org
  runs-on: ubuntu-latest
  steps:
    - name: Check website availability
      id: check
      run: |
        echo "Checking lumos-lang.org..."

        # The default run shell is `bash -e`, so a curl transport failure
        # (DNS, TLS, timeout) would abort the script right here, before the
        # step outputs are written, leaving the alert issue with blank
        # fields. `|| true` keeps the script alive; curl still prints 000
        # as the status code when no response was received.
        HTTP_CODE=$(curl -o /dev/null -s -w "%{http_code}" -L --max-time 30 https://lumos-lang.org) || true
        HTTP_CODE=${HTTP_CODE:-000}

        # Outputs consumed by the issue-creation step below.
        echo "http_code=$HTTP_CODE" >> "$GITHUB_OUTPUT"
        echo "timestamp=$(date -u +"%Y-%m-%d %H:%M:%S UTC")" >> "$GITHUB_OUTPUT"

        # String comparison cannot error out on a non-numeric value.
        if [ "$HTTP_CODE" = "200" ]; then
          echo "✅ Website is UP (HTTP $HTTP_CODE)"
          exit 0
        else
          echo "❌ Website is DOWN or unreachable (HTTP $HTTP_CODE)"
          exit 1
        fi

    # Best-effort diagnostics: every command is allowed to fail so this step
    # never masks the original failure.
    - name: Additional diagnostics (on failure)
      if: failure()
      run: |
        echo "Running additional diagnostics..."
        echo "=== DNS Resolution ==="
        nslookup lumos-lang.org || true
        echo "=== Ping Test ==="
        ping -c 3 lumos-lang.org || true
        echo "=== curl verbose ==="
        curl -v --max-time 30 https://lumos-lang.org || true

    - name: Create issue on failure
      if: failure()
      uses: actions/github-script@v8
      env:
        # Step outputs are passed through the environment rather than being
        # interpolated into the script source with `${{ }}`.
        HTTP_CODE: ${{ steps.check.outputs.http_code }}
        CHECK_TIMESTAMP: ${{ steps.check.outputs.timestamp }}
      with:
        script: |
          const httpCode = process.env.HTTP_CODE || 'unknown';
          const timestamp = process.env.CHECK_TIMESTAMP || new Date().toISOString();
          const { owner, repo } = context.repo;
          const runUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;

          // The crates.io job shares the 'uptime-alert' label, so match on
          // the title as well; otherwise a website outage could be appended
          // to the crates.io alert instead of getting its own issue.
          const { data: openAlerts } = await github.rest.issues.listForRepo({
            owner,
            repo,
            state: 'open',
            labels: 'uptime-alert',
            per_page: 100,
          });
          const websiteIssue = openAlerts.find(
            (issue) => issue.title.includes('lumos-lang.org')
          );

          if (websiteIssue) {
            // An alert is already open: record this failure as a comment.
            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number: websiteIssue.number,
              body: [
                '🔴 **Another downtime detected**',
                '',
                `- **Time**: ${timestamp}`,
                `- **HTTP Code**: ${httpCode}`,
                `- **Workflow Run**: ${runUrl}`,
              ].join('\n'),
            });
            console.log(`Updated existing issue #${websiteIssue.number}`);
          } else {
            // No open alert for the website yet: create one.
            await github.rest.issues.create({
              owner,
              repo,
              title: '🚨 lumos-lang.org is DOWN',
              labels: ['uptime-alert', 'priority:critical', 'area:infrastructure'],
              body: [
                '## Website Downtime Detected',
                '',
                'Automated monitoring detected that **lumos-lang.org** is unreachable.',
                '',
                '### Details',
                '',
                `- **Time**: ${timestamp}`,
                `- **HTTP Code**: ${httpCode}`,
                '- **Expected**: 200 OK',
                `- **Workflow Run**: ${runUrl}`,
                '',
                '### Action Required',
                '',
                '1. Check website hosting status',
                '2. Verify DNS configuration',
                '3. Check Cloudflare Pages deployment',
                '4. Review recent changes to docs-lumos repository',
                '',
                '### Monitoring',
                '',
                'This issue will automatically update with additional downtime events. ' +
                  'Close this issue once the website is confirmed to be back online.',
                '',
                '---',
                '*This issue was automatically created by the uptime monitoring workflow.*',
              ].join('\n'),
            });
            console.log('Created new uptime alert issue');
          }
# Job: verify that the lumos-core, lumos-cli and lumos-lsp crates answer on
# the crates.io API (with retries) and, when any of them does not, create
# (or comment on) a crates.io alert issue.
check-crates-io:
  name: Check crates.io packages
  runs-on: ubuntu-latest
  steps:
    - name: Check crates.io packages with retry
      id: check-crates
      run: |
        # check_crate <name>
        # Returns 0 as soon as the crates.io API answers 200 for the crate,
        # or 1 after max_retries unsuccessful attempts.
        check_crate() {
          local crate=$1
          local max_retries=3
          local retry_delay=10
          local http_code
          local i

          for ((i = 1; i <= max_retries; i++)); do
            # `|| true`: a transport failure must count as a failed attempt,
            # not abort the script under `bash -e`; curl prints 000 then.
            http_code=$(curl -o /dev/null -s -w "%{http_code}" --max-time 30 -H "User-Agent: lumos-uptime-monitor (https://github.com/getlumos/lumos)" "https://crates.io/api/v1/crates/$crate") || true
            http_code=${http_code:-000}

            if [ "$http_code" = "200" ]; then
              echo "✅ $crate is accessible (attempt $i)"
              return 0
            fi

            echo "⚠️ $crate returned HTTP $http_code (attempt $i/$max_retries)"
            if [ "$i" -lt "$max_retries" ]; then
              echo "Retrying in ${retry_delay}s..."
              sleep "$retry_delay"
            fi
          done

          echo "❌ $crate not accessible after $max_retries attempts"
          return 1
        }

        # Check every crate even if an earlier one fails, collecting the
        # failures as a space-separated list with no leading space.
        FAILED=""
        for crate in lumos-core lumos-cli lumos-lsp; do
          echo "=== Checking $crate ==="
          if ! check_crate "$crate"; then
            FAILED="${FAILED:+$FAILED }$crate"
          fi
        done

        if [ -n "$FAILED" ]; then
          echo "failed_crates=$FAILED" >> "$GITHUB_OUTPUT"
          exit 1
        fi
        echo "✅ All crates accessible"

    - name: Create or update issue on crates.io failure
      if: failure()
      uses: actions/github-script@v8
      env:
        # Step output is passed through the environment rather than being
        # interpolated into the script source with `${{ }}`.
        FAILED_CRATES: ${{ steps.check-crates.outputs.failed_crates }}
      with:
        script: |
          const timestamp = new Date().toISOString();
          // Empty when the check step failed before writing its output.
          const failedCrates = (process.env.FAILED_CRATES || '').trim() || 'unknown';
          const { owner, repo } = context.repo;
          const runUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;

          // The 'uptime-alert' label is shared with the website job, so the
          // crates.io alert is identified by its title.
          const { data: openAlerts } = await github.rest.issues.listForRepo({
            owner,
            repo,
            state: 'open',
            labels: 'uptime-alert',
            per_page: 100,
          });
          const cratesIssue = openAlerts.find(
            (issue) => issue.title.includes('crates.io')
          );

          if (cratesIssue) {
            // An alert is already open: record this failure as a comment.
            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number: cratesIssue.number,
              body: [
                '🔴 **Another check failure**',
                '',
                `- **Time**: ${timestamp}`,
                `- **Failed crates**: ${failedCrates}`,
                `- **Workflow Run**: ${runUrl}`,
              ].join('\n'),
            });
            console.log(`Updated existing issue #${cratesIssue.number}`);
          } else {
            // No open crates.io alert yet: create one.
            await github.rest.issues.create({
              owner,
              repo,
              title: '🚨 crates.io packages unreachable',
              labels: ['uptime-alert', 'priority:high', 'area:infrastructure'],
              body: [
                '## crates.io Downtime Detected',
                '',
                'Automated monitoring detected that LUMOS packages on crates.io are unreachable.',
                '',
                '### Details',
                '',
                `- **Time**: ${timestamp}`,
                `- **Failed crates**: ${failedCrates}`,
                `- **Workflow Run**: ${runUrl}`,
                '',
                '### Packages to Check',
                '',
                '- lumos-core: https://crates.io/crates/lumos-core',
                '- lumos-cli: https://crates.io/crates/lumos-cli',
                '- lumos-lsp: https://crates.io/crates/lumos-lsp',
                '',
                '### Possible Causes',
                '',
                '1. crates.io is experiencing downtime',
                '2. Packages were yanked or unpublished',
                '3. Network issues',
                '',
                '### Monitoring',
                '',
                'This issue will automatically update with additional failures. ' +
                  'Close this issue once packages are confirmed accessible.',
                '',
                '---',
                '*This issue was automatically created by the uptime monitoring workflow.*',
              ].join('\n'),
            });
            console.log('Created new crates.io alert issue');
          }