Deployment phase 2: fix CI #168
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: 'Platform Validation & E2E Tests'

# Runs on pushes to, and pull requests targeting, `main` and
# `argocd-gitops-dev`. Manual dispatch additionally exposes three inputs that
# select the cluster mode and narrow down which apps are validated.
on:
  push:
    branches: [main, argocd-gitops-dev]
  pull_request:
    branches: [main, argocd-gitops-dev]
  workflow_dispatch:
    inputs:
      cluster_mode:
        description: 'Cluster mode (kind/existing)'
        required: false
        default: 'kind'
        type: choice
        options: [kind, existing]
      exclude_apps:
        description: 'Comma-separated apps to exclude'
        required: false
        default: ''
      only_critical:
        description: 'Only validate critical apps'
        required: false
        default: false
        type: boolean
# Scopes requested for the workflow's GITHUB_TOKEN.
# NOTE(review): for `pull_request` runs that originate from a fork, GitHub
# normally issues a read-only token regardless of what is requested here, so
# the PR comment step may be unable to post on fork PRs - confirm against the
# repository's fork pull request settings.
permissions:
  actions: read  # Read workflow runs
  checks: write  # Update check status
  contents: read  # Read repository files
  pull-requests: write  # Comment on PRs
# Pinned tool versions, exposed to every step as environment variables and
# referenced by the Python setup and CLI download steps.
env:
  ARGOCD_VERSION: 'v2.9.3'
  KIND_VERSION: 'v0.20.0'
  KUBECTL_VERSION: 'v1.28.0'
  PYTHON_VERSION: '3.11'
jobs:
  # Single job: provision a cluster (Kind by default), deploy the platform,
  # wait for the ArgoCD applications, run the validation and E2E test suites,
  # then publish reports, a PR comment and a job summary.
  validate-app-state:
    name: Validate ArgoCD Application State
    runs-on: ubuntu-latest
    # Extended to 60 minutes for complete platform testing (all apps including
    # optional observability, Kiali, Ollama).
    # NOTE(review): the monitoring step below is given the same 60-minute
    # budget (3600), so the job-level timeout will normally expire before the
    # script's own timeout can fire - confirm which limit is meant to win.
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          # Requirement specifiers must be quoted: an unquoted `>=` is parsed
          # by the shell as an output redirection, which silently drops the
          # version constraint and writes pip's output to a file named
          # `=<version>`.
          pip install \
            'kubernetes>=28.1.0' \
            'pytest>=8.0.0' \
            'pytest-html>=4.1.0' \
            'pytest-json-report>=1.5.0' \
            'tenacity>=8.2.3' \
            'rich>=13.7.0'

      # Manual runs honour the `cluster_mode` input; every other trigger uses
      # a throwaway Kind cluster. The input is passed through `env` rather
      # than interpolated into the script text.
      - name: Determine cluster mode
        id: cluster_mode
        env:
          EVENT_NAME: ${{ github.event_name }}
          REQUESTED_MODE: ${{ inputs.cluster_mode }}
        run: |
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "mode=${REQUESTED_MODE}" >> "$GITHUB_OUTPUT"
          else
            echo "mode=kind" >> "$GITHUB_OUTPUT"
          fi

      - name: Free up disk space (manual cleanup)
        if: steps.cluster_mode.outputs.mode == 'kind'
        run: |
          echo "🧹 Freeing up disk space by removing pre-installed software..."
          echo "Disk usage BEFORE cleanup:"
          df -h /
          # Remove large pre-installed software
          echo "Removing .NET SDK (~20GB)..."
          sudo rm -rf /usr/share/dotnet || true
          echo "Removing Android SDK (~10GB)..."
          sudo rm -rf /usr/local/lib/android || true
          echo "Removing Haskell (~5GB)..."
          sudo rm -rf /opt/ghc || true
          sudo rm -rf /usr/local/.ghcup || true
          echo "Removing CodeQL (~5GB)..."
          sudo rm -rf /opt/hostedtoolcache/CodeQL || true
          echo "Removing unused tools..."
          sudo rm -rf /usr/local/share/powershell || true
          sudo rm -rf /usr/local/share/chromium || true
          sudo rm -rf /usr/local/lib/node_modules || true
          echo ""
          echo "Disk usage AFTER cleanup:"
          df -h /
          echo ""
          echo "✅ Cleanup complete"

      - name: Set up Docker (for Kind)
        if: steps.cluster_mode.outputs.mode == 'kind'
        uses: docker/setup-buildx-action@v3

      - name: Clean up Docker at workflow start
        if: steps.cluster_mode.outputs.mode == 'kind'
        run: |
          echo "🧹 Initial cleanup - removing stale Docker resources from previous runs..."
          echo "Disk usage BEFORE cleanup:"
          df -h
          docker system df || true
          # Remove all unused Docker resources from previous runs
          docker system prune -af --volumes || true
          echo ""
          echo "Disk usage AFTER cleanup:"
          df -h
          docker system df || true

      - name: Install Kind
        if: steps.cluster_mode.outputs.mode == 'kind'
        run: |
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as the binary.
          curl -fLo ./kind "https://kind.sigs.k8s.io/dl/${{ env.KIND_VERSION }}/kind-linux-amd64"
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind
          kind version

      - name: Install kubectl
        run: |
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as the binary.
          curl -fLO "https://dl.k8s.io/release/${{ env.KUBECTL_VERSION }}/bin/linux/amd64/kubectl"
          chmod +x kubectl
          sudo mv kubectl /usr/local/bin/
          kubectl version --client

      - name: Install ArgoCD CLI
        run: |
          cd /tmp
          curl -fsSL -o argocd "https://github.com/argoproj/argo-cd/releases/download/${{ env.ARGOCD_VERSION }}/argocd-linux-amd64"
          chmod +x argocd
          sudo mv argocd /usr/local/bin/argocd
          argocd version --client

      - name: Build and export operator images for CI
        if: steps.cluster_mode.outputs.mode == 'kind'
        run: |
          echo "🔨 Building operator images for CI (AMD64)..."
          # Clone operator repository
          # NOTE(review): this tracks the tip of a feature branch, so builds
          # are not reproducible - consider pinning a commit SHA.
          git clone --depth 1 --branch fix/add-kagenti-operator-image-build \
            https://github.com/Ladas/kagenti-operator /tmp/kagenti-operator
          # Build kagenti-operator
          echo "Building kagenti-operator..."
          cd /tmp/kagenti-operator/kagenti-operator
          docker build -t localhost:5001/kagenti-operator:dev .
          # Build platform-operator
          echo "Building platform-operator..."
          cd /tmp/kagenti-operator/platform-operator
          docker build -t localhost:5001/kagenti-platform-operator:dev .
          # Export as tar files for quick-redeploy.sh
          echo "Exporting operator images as tar files..."
          cd "$GITHUB_WORKSPACE"
          mkdir -p .images
          docker save localhost:5001/kagenti-operator:dev -o .images/kagenti-operator-dev.tar
          docker save localhost:5001/kagenti-platform-operator:dev -o .images/kagenti-platform-operator-dev.tar
          echo "✅ Operator images built and exported successfully (AMD64)"
          ls -lah .images/

      - name: Clean up Docker to free disk space
        if: steps.cluster_mode.outputs.mode == 'kind'
        run: |
          echo "🧹 Cleaning up Docker resources to prevent disk exhaustion..."
          echo "Before cleanup:"
          df -h
          docker system df
          # Remove build cache, unused images, and stopped containers
          # (the operator images were already exported to .images/ above)
          docker system prune -af --volumes
          # Remove operator repository clone to save space
          rm -rf /tmp/kagenti-operator
          echo ""
          echo "After cleanup:"
          df -h
          docker system df

      - name: Deploy Platform with quick-redeploy.sh
        if: steps.cluster_mode.outputs.mode == 'kind'
        run: |
          chmod +x ./scripts/quick-redeploy.sh
          # CI mode skips interactive prompts automatically
          ./scripts/quick-redeploy.sh

      - name: Wait for ArgoCD applications to sync and become healthy
        run: |
          echo "Waiting for ArgoCD applications to sync and become healthy..."
          echo "Using enhanced monitoring script with formatted status tables..."
          echo ""
          # Run enhanced monitoring script with 60-minute timeout
          # Script monitors ALL apps (CRITICAL and OPTIONAL) and displays:
          # - Formatted ArgoCD application status table
          # - Formatted pod status by namespace table
          # - Progress tracking with elapsed time
          # - Smart failure logic (only fails on CRITICAL apps degraded)
          ./scripts/monitor-argocd-apps.sh 3600

      # Resolve the effective test filters. The free-text `exclude_apps` input
      # is passed through `env` so it is never spliced into the script text.
      - name: Set test parameters
        id: test_params
        env:
          EVENT_NAME: ${{ github.event_name }}
          EXCLUDE_APPS_INPUT: ${{ inputs.exclude_apps }}
          ONLY_CRITICAL_INPUT: ${{ inputs.only_critical }}
        run: |
          EXCLUDE_APPS="$EXCLUDE_APPS_INPUT"
          ONLY_CRITICAL="$ONLY_CRITICAL_INPUT"
          # No automatic exclusions - test all apps in CI
          # For manual runs, use workflow_dispatch inputs to exclude apps if needed
          if [[ "$EVENT_NAME" != "workflow_dispatch" ]]; then
            EXCLUDE_APPS=""
            ONLY_CRITICAL="false"
          fi
          {
            echo "exclude_apps=${EXCLUDE_APPS}"
            echo "only_critical=${ONLY_CRITICAL}"
          } >> "$GITHUB_OUTPUT"

      - name: Run app state validation
        id: validation
        env:
          PARAM_EXCLUDE_APPS: ${{ steps.test_params.outputs.exclude_apps }}
          PARAM_ONLY_CRITICAL: ${{ steps.test_params.outputs.only_critical }}
        run: |
          # An array keeps each option a single argument even if a value
          # contains whitespace.
          pytest_args=(
            -v
            --html=app-state-report.html
            --self-contained-html
            --json-report
            --json-report-file=app-state-report.json
          )
          if [[ -n "$PARAM_EXCLUDE_APPS" ]]; then
            pytest_args+=("--exclude-app=${PARAM_EXCLUDE_APPS}")
          fi
          if [[ "$PARAM_ONLY_CRITICAL" == "true" ]]; then
            pytest_args+=(--only-critical)
          fi
          pytest tests/validation/test_app_state.py "${pytest_args[@]}"

      # Best-effort diagnostics: every command is guarded so that one failing
      # kubectl call does not prevent the remaining captures.
      - name: Capture failing pod logs
        if: failure()  # Runs whenever any earlier step has failed
        run: |
          echo "=== Capturing logs from failing pods ==="
          mkdir -p /tmp/debug-logs
          # Capture logs from all namespaces with CrashLoopBackOff pods
          echo "Finding CrashLoopBackOff pods..."
          kubectl get pods -A -o json | jq -r '
            .items[] |
            select(.status.containerStatuses[]? | select(.state.waiting?.reason == "CrashLoopBackOff")) |
            "\(.metadata.namespace) \(.metadata.name)"
          ' | while read -r ns pod; do
            echo "Capturing logs from $ns/$pod"
            kubectl logs -n "$ns" "$pod" --all-containers --tail=200 > "/tmp/debug-logs/${ns}_${pod}.log" 2>&1 || echo "Failed to get logs from $ns/$pod"
            kubectl logs -n "$ns" "$pod" --all-containers --previous --tail=200 > "/tmp/debug-logs/${ns}_${pod}_previous.log" 2>&1 || echo "No previous logs for $ns/$pod"
          done
          # Specifically capture operator logs (even if not crashing)
          echo "Capturing kagenti-operator logs..."
          kubectl logs -n kagenti-operator -l control-plane=controller-manager --tail=200 > /tmp/debug-logs/kagenti-operator-all.log 2>&1 || echo "No kagenti-operator logs"
          echo "Capturing kagenti-platform-operator logs..."
          kubectl logs -n kagenti-platform-operator -l control-plane=controller-manager --tail=200 > /tmp/debug-logs/platform-operator-all.log 2>&1 || echo "No platform-operator logs"
          # Tekton logs
          echo "Capturing tekton logs..."
          kubectl logs -n tekton-pipelines -l app.kubernetes.io/part-of=tekton-pipelines --tail=200 > /tmp/debug-logs/tekton-all.log 2>&1 || echo "No tekton logs"
          # Capture all events sorted by time
          echo "Capturing cluster events..."
          kubectl get events -A --sort-by='.lastTimestamp' > /tmp/debug-logs/all-events.txt 2>&1 || true
          # Capture pod descriptions for failing pods
          echo "Capturing pod descriptions..."
          kubectl describe pods -n kagenti-operator > /tmp/debug-logs/kagenti-operator-describe.txt 2>&1 || true
          kubectl describe pods -n kagenti-platform-operator > /tmp/debug-logs/platform-operator-describe.txt 2>&1 || true
          kubectl describe pods -n tekton-pipelines > /tmp/debug-logs/tekton-describe.txt 2>&1 || true
          # Capture ArgoCD application statuses
          echo "Capturing ArgoCD application details..."
          kubectl get applications -n argocd -o yaml > /tmp/debug-logs/argocd-applications.yaml 2>&1 || true
          echo "=== Debug logs captured ==="
          ls -lah /tmp/debug-logs/

      - name: Upload debug logs
        if: failure()  # Uploads whenever any earlier step has failed
        uses: actions/upload-artifact@v4
        with:
          name: crash-debug-logs
          path: /tmp/debug-logs/
          retention-days: 7

      # E2E failures are reported but, because of continue-on-error, do not
      # fail the job.
      - name: Run E2E platform tests
        id: e2e_tests
        continue-on-error: true
        env:
          PARAM_EXCLUDE_APPS: ${{ steps.test_params.outputs.exclude_apps }}
        run: |
          echo "Running E2E platform tests..."
          pytest_args=(
            -v
            --tb=short
            --html=e2e-report.html
            --self-contained-html
            --json-report
            --json-report-file=e2e-report.json
            --continue-on-collection-errors
          )
          if [[ -n "$PARAM_EXCLUDE_APPS" ]]; then
            pytest_args+=("--exclude-app=${PARAM_EXCLUDE_APPS}")
          fi
          pytest tests/e2e/test_platform_e2e.py "${pytest_args[@]}"

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: validation-results
          path: |
            app-state-report.html
            app-state-report.json
            e2e-report.html
            e2e-report.json
          retention-days: 30

      # Reads the pytest JSON reports and exposes the counters as step
      # outputs; a missing report counts as zero tests.
      - name: Parse test results
        if: always()
        id: parse_results
        run: |
          # Parse app state validation results
          if [[ -f app-state-report.json ]]; then
            APP_TOTAL=$(jq -r '.summary.total // 0' app-state-report.json)
            APP_PASSED=$(jq -r '.summary.passed // 0' app-state-report.json)
            APP_FAILED=$(jq -r '.summary.failed // 0' app-state-report.json)
          else
            APP_TOTAL=0
            APP_PASSED=0
            APP_FAILED=0
          fi
          # Parse E2E test results
          if [[ -f e2e-report.json ]]; then
            E2E_TOTAL=$(jq -r '.summary.total // 0' e2e-report.json)
            E2E_PASSED=$(jq -r '.summary.passed // 0' e2e-report.json)
            E2E_FAILED=$(jq -r '.summary.failed // 0' e2e-report.json)
            E2E_XFAILED=$(jq -r '.summary.xfailed // 0' e2e-report.json)
          else
            E2E_TOTAL=0
            E2E_PASSED=0
            E2E_FAILED=0
            E2E_XFAILED=0
          fi
          # Combined totals
          TOTAL=$((APP_TOTAL + E2E_TOTAL))
          PASSED=$((APP_PASSED + E2E_PASSED))
          FAILED=$((APP_FAILED + E2E_FAILED))
          {
            echo "total=${TOTAL}"
            echo "passed=${PASSED}"
            echo "failed=${FAILED}"
            echo "app_total=${APP_TOTAL}"
            echo "app_passed=${APP_PASSED}"
            echo "app_failed=${APP_FAILED}"
            echo "e2e_total=${E2E_TOTAL}"
            echo "e2e_passed=${E2E_PASSED}"
            echo "e2e_failed=${E2E_FAILED}"
            echo "e2e_xfailed=${E2E_XFAILED}"
          } >> "$GITHUB_OUTPUT"

      - name: Comment on PR
        if: always() && github.event_name == 'pull_request'
        continue-on-error: true  # Don't fail job if comment fails
        uses: actions/github-script@v7
        env:
          # Free-text value: read from the environment instead of being
          # interpolated into the JavaScript source.
          EXCLUDED_APPS: ${{ steps.test_params.outputs.exclude_apps || 'None' }}
        with:
          script: |
            const total = '${{ steps.parse_results.outputs.total }}';
            const passed = '${{ steps.parse_results.outputs.passed }}';
            const failed = '${{ steps.parse_results.outputs.failed }}';
            const appTotal = '${{ steps.parse_results.outputs.app_total }}';
            const appPassed = '${{ steps.parse_results.outputs.app_passed }}';
            const appFailed = '${{ steps.parse_results.outputs.app_failed }}';
            const e2eTotal = '${{ steps.parse_results.outputs.e2e_total }}';
            const e2ePassed = '${{ steps.parse_results.outputs.e2e_passed }}';
            const e2eFailed = '${{ steps.parse_results.outputs.e2e_failed }}';
            const e2eXFailed = '${{ steps.parse_results.outputs.e2e_xfailed }}';
            const excludedApps = process.env.EXCLUDED_APPS ?? 'None';

            const status = failed === '0' ? '✅ PASSED' : '❌ FAILED';
            const color = failed === '0' ? '🟢' : '🔴';

            // Built line by line so the Markdown does not depend on how the
            // YAML block scalar happens to be indented.
            const comment = [
              `## ${color} Platform Validation & E2E Tests ${status}`,
              '',
              '### 📋 App State Validation',
              `- Total: ${appTotal}`,
              `- Passed: ✅ ${appPassed}`,
              `- Failed: ❌ ${appFailed}`,
              '',
              '### 🧪 E2E Platform Tests',
              `- Total: ${e2eTotal}`,
              `- Passed: ✅ ${e2ePassed}`,
              `- Failed: ❌ ${e2eFailed}`,
              `- Expected Failures: ⚠️ ${e2eXFailed}`,
              '',
              '### 📊 Combined Results',
              `- Total Tests: ${total}`,
              `- Passed: ✅ ${passed}`,
              `- Failed: ❌ ${failed}`,
              '',
              '**Cluster Mode:** ${{ steps.cluster_mode.outputs.mode }}',
              `**Excluded Apps:** ${excludedApps}`,
              '**Only Critical:** ${{ steps.test_params.outputs.only_critical }}',
              '',
              '📊 [View detailed report](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})',
              '',
            ].join('\n');

            // Await the request so an API error surfaces in this step.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: comment
            });

      - name: Generate summary
        if: always()
        env:
          # Free-text value: read from the environment instead of being
          # interpolated into the shell script.
          EXCLUDED_APPS: ${{ steps.test_params.outputs.exclude_apps || 'None' }}
        run: |
          {
            echo "## Platform Validation & E2E Test Results"
            echo ""
            echo "### 📋 App State Validation"
            echo "**Status:** ${{ steps.validation.outcome }}"
            echo "**Total:** ${{ steps.parse_results.outputs.app_total }}"
            echo "**Passed:** ${{ steps.parse_results.outputs.app_passed }}"
            echo "**Failed:** ${{ steps.parse_results.outputs.app_failed }}"
            echo ""
            echo "### 🧪 E2E Platform Tests"
            echo "**Status:** ${{ steps.e2e_tests.outcome }}"
            echo "**Total:** ${{ steps.parse_results.outputs.e2e_total }}"
            echo "**Passed:** ${{ steps.parse_results.outputs.e2e_passed }}"
            echo "**Failed:** ${{ steps.parse_results.outputs.e2e_failed }}"
            echo "**Expected Failures:** ${{ steps.parse_results.outputs.e2e_xfailed }}"
            echo ""
            echo "### 📊 Combined Results"
            echo "**Total Tests:** ${{ steps.parse_results.outputs.total }}"
            echo "**Passed:** ${{ steps.parse_results.outputs.passed }}"
            echo "**Failed:** ${{ steps.parse_results.outputs.failed }}"
            echo ""
            echo "**Cluster Mode:** ${{ steps.cluster_mode.outputs.mode }}"
            echo "**Excluded Apps:** ${EXCLUDED_APPS}"
            echo "**Only Critical:** ${{ steps.test_params.outputs.only_critical }}"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Cleanup Kind cluster
        if: always() && steps.cluster_mode.outputs.mode == 'kind'
        run: |
          kind delete cluster --name kagenti-demo || true

      # A condition without a status-check function is implicitly combined
      # with success(), so without always() this guard would be skipped
      # exactly when validation has failed.
      - name: Fail job if validation failed
        if: always() && steps.validation.outcome == 'failure'
        run: exit 1