Skip to content

Deployment phase 2: fix CI #168

Deployment phase 2: fix CI

Deployment phase 2: fix CI #168

name: Platform Validation & E2E Tests

# Triggers: pushes and pull requests against the two tracked branches, plus
# manual runs that may override cluster mode and test selection.
on:
  push:
    branches: [main, argocd-gitops-dev]
  pull_request:
    branches: [main, argocd-gitops-dev]
  workflow_dispatch:
    inputs:
      # Which cluster the run targets; only "kind" triggers the local
      # cluster setup steps further down.
      cluster_mode:
        description: "Cluster mode (kind/existing)"
        required: false
        default: kind
        type: choice
        options: [kind, existing]
      # Forwarded to pytest as --exclude-app when non-empty.
      exclude_apps:
        description: "Comma-separated apps to exclude"
        required: false
        default: ""
      # Forwarded to pytest as --only-critical when true.
      only_critical:
        description: "Only validate critical apps"
        required: false
        default: false
        type: boolean
# GITHUB_TOKEN scopes requested for this workflow.
# NOTE(review): pull_request runs that originate from forks normally get a
# read-only token regardless of what is requested here - confirm whether PR
# comments are expected to work for fork PRs.
permissions:
  contents: read        # read repository files
  pull-requests: write  # comment on PRs
  actions: read         # read workflow runs
  checks: write         # update check status
# Tool versions referenced by the install steps via the env context.
# Kept as quoted strings so YAML never retypes them (e.g. 3.10 -> 3.1).
env:
  PYTHON_VERSION: "3.11"
  KIND_VERSION: "v0.20.0"
  KUBECTL_VERSION: "v1.28.0"
  ARGOCD_VERSION: "v2.9.3"
jobs:
  validate-app-state:
    name: Validate ArgoCD Application State
    runs-on: ubuntu-latest
    # Extended to 60 minutes for complete platform testing (all apps
    # including optional observability, Kiali, Ollama).
    # NOTE(review): this cap covers every step of the job, while the ArgoCD
    # wait step alone is given 3600 seconds - confirm the two budgets are
    # meant to be equal.
    timeout-minutes: 60
    steps:
- name: Checkout repository
  # Later steps call ./scripts/* and tests/* from this working tree.
  uses: actions/checkout@v4
- name: Set up Python
  uses: actions/setup-python@v5
  with:
    # Interpreter version comes from the workflow-level env block.
    python-version: ${{ env.PYTHON_VERSION }}
    # Enable the action's built-in pip cache.
    cache: "pip"
- name: Install Python dependencies
  run: |
    pip install --upgrade pip
    # Each requirement specifier is quoted on purpose: an unquoted ">=" is a
    # shell output redirection, which would drop the version constraint and
    # create stray files named "=28.1.0", "=8.0.0", ... in the workspace.
    pip install \
      "kubernetes>=28.1.0" \
      "pytest>=8.0.0" \
      "pytest-html>=4.1.0" \
      "pytest-json-report>=1.5.0" \
      "tenacity>=8.2.3" \
      "rich>=13.7.0"
- name: Determine cluster mode
  id: cluster_mode
  # Context values are handed to the script as environment variables instead
  # of being pasted into the script text, so they are always treated as data.
  env:
    EVENT_NAME: ${{ github.event_name }}
    REQUESTED_MODE: ${{ inputs.cluster_mode }}
  run: |
    # Manual runs honour the cluster_mode input; every other trigger uses kind.
    if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
      mode="$REQUESTED_MODE"
    else
      mode="kind"
    fi
    # Exposed to later steps as steps.cluster_mode.outputs.mode.
    echo "mode=${mode}" >> "$GITHUB_OUTPUT"
- name: Free up disk space (manual cleanup)
  if: steps.cluster_mode.outputs.mode == 'kind'
  run: |
    # Announce a group, then delete each listed path; a failed delete is
    # ignored so the cleanup stays best-effort.
    remove_paths() {
      local label="$1"
      shift
      echo "$label"
      local target
      for target in "$@"; do
        sudo rm -rf "$target" || true
      done
    }

    echo "🧹 Freeing up disk space by removing pre-installed software..."
    echo "Disk usage BEFORE cleanup:"
    df -h /

    # Large pre-installed toolchains (sizes are the original author's estimates).
    remove_paths "Removing .NET SDK (~20GB)..." /usr/share/dotnet
    remove_paths "Removing Android SDK (~10GB)..." /usr/local/lib/android
    remove_paths "Removing Haskell (~5GB)..." /opt/ghc /usr/local/.ghcup
    remove_paths "Removing CodeQL (~5GB)..." /opt/hostedtoolcache/CodeQL
    remove_paths "Removing unused tools..." \
      /usr/local/share/powershell \
      /usr/local/share/chromium \
      /usr/local/lib/node_modules

    echo ""
    echo "Disk usage AFTER cleanup:"
    df -h /
    echo ""
    echo "✅ Cleanup complete"
- name: Set up Docker (for Kind)
  # Only needed when this run creates its own Kind cluster.
  if: steps.cluster_mode.outputs.mode == 'kind'
  uses: docker/setup-buildx-action@v3
- name: Clean up Docker at workflow start
  if: steps.cluster_mode.outputs.mode == 'kind'
  run: |
    # Print filesystem usage, then Docker's own accounting (best-effort).
    report_usage() {
      df -h
      docker system df || true
    }

    echo "🧹 Initial cleanup - removing stale Docker resources from previous runs..."
    echo "Disk usage BEFORE cleanup:"
    report_usage

    # Drop every unused image, container, network, volume and cache entry;
    # a prune failure is tolerated.
    docker system prune -af --volumes || true

    echo ""
    echo "Disk usage AFTER cleanup:"
    report_usage
- name: Install Kind
  if: steps.cluster_mode.outputs.mode == 'kind'
  run: |
    # -f makes curl exit non-zero on an HTTP error instead of saving the
    # error page as ./kind, which would only surface later as a confusing
    # exec failure. -sS keeps the log quiet but still prints errors.
    curl -fsSL -o ./kind "https://kind.sigs.k8s.io/dl/${{ env.KIND_VERSION }}/kind-linux-amd64"
    chmod +x ./kind
    sudo mv ./kind /usr/local/bin/kind
    # Smoke-test the installed binary.
    kind version
- name: Install kubectl
  run: |
    # -f makes curl fail on an HTTP error rather than writing the error body
    # to ./kubectl; -O keeps the remote file name as before.
    curl -fsSLO "https://dl.k8s.io/release/${{ env.KUBECTL_VERSION }}/bin/linux/amd64/kubectl"
    chmod +x kubectl
    sudo mv kubectl /usr/local/bin/
    # Smoke-test the installed binary (client only, no cluster needed).
    kubectl version --client
- name: Install ArgoCD CLI
  run: |
    # Download into /tmp, mark executable, then move onto PATH.
    download=/tmp/argocd
    release_url="https://github.com/argoproj/argo-cd/releases/download/${{ env.ARGOCD_VERSION }}/argocd-linux-amd64"
    curl -fsSL -o "$download" "$release_url"
    chmod +x "$download"
    sudo mv "$download" /usr/local/bin/argocd
    # Smoke-test the installed binary (client only).
    argocd version --client
- name: Build and export operator images for CI
  if: steps.cluster_mode.outputs.mode == 'kind'
  run: |
    operator_src=/tmp/kagenti-operator

    echo "🔨 Building operator images for CI (AMD64)..."

    # Shallow clone of the operator sources at the branch carrying the
    # image build.
    git clone --depth 1 --branch fix/add-kagenti-operator-image-build \
      https://github.com/Ladas/kagenti-operator "$operator_src"

    # Each operator lives in its own sub-directory, used as the build context.
    echo "Building kagenti-operator..."
    docker build -t localhost:5001/kagenti-operator:dev "$operator_src/kagenti-operator"

    echo "Building platform-operator..."
    docker build -t localhost:5001/kagenti-platform-operator:dev "$operator_src/platform-operator"

    # Save both images as tarballs under .images/ in the workspace
    # (the original notes say quick-redeploy.sh consumes them).
    echo "Exporting operator images as tar files..."
    cd "$GITHUB_WORKSPACE"
    mkdir -p .images
    docker save localhost:5001/kagenti-operator:dev -o .images/kagenti-operator-dev.tar
    docker save localhost:5001/kagenti-platform-operator:dev -o .images/kagenti-platform-operator-dev.tar

    echo "✅ Operator images built and exported successfully (AMD64)"
    ls -lah .images/
- name: Clean up Docker to free disk space
  if: steps.cluster_mode.outputs.mode == 'kind'
  run: |
    # Print filesystem usage followed by Docker's own accounting.
    # Unlike the start-of-run cleanup, failures here fail the step.
    show_usage() {
      df -h
      docker system df
    }

    echo "🧹 Cleaning up Docker resources to prevent disk exhaustion..."
    echo "Before cleanup:"
    show_usage

    # Remove build cache, unused images, and stopped containers.
    docker system prune -af --volumes
    # The operator clone is no longer needed once the tarballs exist.
    rm -rf /tmp/kagenti-operator

    echo ""
    echo "After cleanup:"
    show_usage
- name: Deploy Platform with quick-redeploy.sh
  if: steps.cluster_mode.outputs.mode == 'kind'
  run: |
    deploy_script=./scripts/quick-redeploy.sh
    # Make sure the script is executable regardless of the checked-in mode.
    chmod +x "$deploy_script"
    # Per the original note, the script detects CI and skips its
    # interactive prompts (not verifiable from this file).
    "$deploy_script"
- name: Wait for ArgoCD applications to sync and become healthy
  run: |
    printf '%s\n' \
      "Waiting for ArgoCD applications to sync and become healthy..." \
      "Using enhanced monitoring script with formatted status tables..." \
      ""
    # The single argument is the wait budget in seconds (3600 = 60 minutes).
    # According to the original notes the script watches all apps, critical
    # and optional, prints status tables for applications and pods with
    # elapsed-time progress, and fails only when a critical app is degraded.
    # NOTE(review): the job itself is capped at 60 minutes in total, so this
    # budget cannot be used up before the job-level timeout hits.
    ./scripts/monitor-argocd-apps.sh 3600
- name: Set test parameters
  id: test_params
  # Inputs reach the script through the environment rather than by textual
  # substitution, so a value containing quotes or $(...) stays plain data.
  env:
    EVENT_NAME: ${{ github.event_name }}
    REQUESTED_EXCLUDE_APPS: ${{ inputs.exclude_apps }}
    REQUESTED_ONLY_CRITICAL: ${{ inputs.only_critical }}
  run: |
    EXCLUDE_APPS="$REQUESTED_EXCLUDE_APPS"
    ONLY_CRITICAL="$REQUESTED_ONLY_CRITICAL"
    # No automatic exclusions - test all apps in CI.
    # Only manual (workflow_dispatch) runs may narrow the selection.
    if [[ "$EVENT_NAME" != "workflow_dispatch" ]]; then
      EXCLUDE_APPS=""
      ONLY_CRITICAL="false"
    fi
    # Exposed as steps.test_params.outputs.{exclude_apps,only_critical}.
    {
      echo "exclude_apps=${EXCLUDE_APPS}"
      echo "only_critical=${ONLY_CRITICAL}"
    } >> "$GITHUB_OUTPUT"
- name: Run app state validation
  id: validation
  # Step outputs are passed via env so they are never spliced into the
  # script text.
  env:
    EXCLUDE_APPS: ${{ steps.test_params.outputs.exclude_apps }}
    ONLY_CRITICAL: ${{ steps.test_params.outputs.only_critical }}
  run: |
    # Arguments are kept in an array so that each one reaches pytest as a
    # single word, even if the exclusion list contains spaces.
    pytest_args=(
      -v
      --html=app-state-report.html
      --self-contained-html
      --json-report
      --json-report-file=app-state-report.json
    )
    if [[ -n "$EXCLUDE_APPS" ]]; then
      pytest_args+=("--exclude-app=${EXCLUDE_APPS}")
    fi
    if [[ "$ONLY_CRITICAL" == "true" ]]; then
      pytest_args+=(--only-critical)
    fi
    pytest tests/validation/test_app_state.py "${pytest_args[@]}"
- name: Capture failing pod logs
  # Runs whenever an earlier step in the job has failed.
  if: failure()
  run: |
    # Best-effort diagnostics: every capture tolerates failure so that one
    # unavailable resource does not prevent the remaining files from being
    # written.
    echo "=== Capturing logs from failing pods ==="
    mkdir -p /tmp/debug-logs

    # Current and previous logs for every pod that has a container waiting
    # in CrashLoopBackOff, across all namespaces.
    echo "Finding CrashLoopBackOff pods..."
    kubectl get pods -A -o json | jq -r '
      .items[] |
      select(.status.containerStatuses[]? | select(.state.waiting?.reason == "CrashLoopBackOff")) |
      "\(.metadata.namespace) \(.metadata.name)"
    ' | while read -r ns pod; do
      echo "Capturing logs from $ns/$pod"
      kubectl logs -n "$ns" "$pod" --all-containers --tail=200 > "/tmp/debug-logs/${ns}_${pod}.log" 2>&1 || echo "Failed to get logs from $ns/$pod"
      kubectl logs -n "$ns" "$pod" --all-containers --previous --tail=200 > "/tmp/debug-logs/${ns}_${pod}_previous.log" 2>&1 || echo "No previous logs for $ns/$pod"
    done

    # Operator logs are captured even when the operators are not crashing.
    echo "Capturing kagenti-operator logs..."
    kubectl logs -n kagenti-operator -l control-plane=controller-manager --tail=200 > /tmp/debug-logs/kagenti-operator-all.log 2>&1 || echo "No kagenti-operator logs"
    echo "Capturing kagenti-platform-operator logs..."
    kubectl logs -n kagenti-platform-operator -l control-plane=controller-manager --tail=200 > /tmp/debug-logs/platform-operator-all.log 2>&1 || echo "No platform-operator logs"

    # Tekton logs
    echo "Capturing tekton logs..."
    kubectl logs -n tekton-pipelines -l app.kubernetes.io/part-of=tekton-pipelines --tail=200 > /tmp/debug-logs/tekton-all.log 2>&1 || echo "No tekton logs"

    # All events sorted by time. Tolerates failure like every other capture;
    # previously an error here aborted the step before the descriptions and
    # ArgoCD state below were collected.
    echo "Capturing cluster events..."
    kubectl get events -A --sort-by='.lastTimestamp' > /tmp/debug-logs/all-events.txt 2>&1 || true

    # Pod descriptions for the operator and Tekton namespaces.
    echo "Capturing pod descriptions..."
    kubectl describe pods -n kagenti-operator > /tmp/debug-logs/kagenti-operator-describe.txt 2>&1 || true
    kubectl describe pods -n kagenti-platform-operator > /tmp/debug-logs/platform-operator-describe.txt 2>&1 || true
    kubectl describe pods -n tekton-pipelines > /tmp/debug-logs/tekton-describe.txt 2>&1 || true

    # Full ArgoCD Application objects from the argocd namespace.
    echo "Capturing ArgoCD application details..."
    kubectl get applications -n argocd -o yaml > /tmp/debug-logs/argocd-applications.yaml 2>&1 || true

    echo "=== Debug logs captured ==="
    ls -lah /tmp/debug-logs/
- name: Upload debug logs
  # Publishes the files gathered by the log-capture step; runs whenever an
  # earlier step has failed.
  if: failure()
  uses: actions/upload-artifact@v4
  with:
    name: crash-debug-logs
    path: /tmp/debug-logs/
    # Debug output is short-lived; keep it for one week.
    retention-days: 7
- name: Run E2E platform tests
  id: e2e_tests
  # E2E failures are reported in the summary but do not fail the job.
  continue-on-error: true
  # The exclusion list is passed via env so it is never spliced into the
  # script text.
  env:
    EXCLUDE_APPS: ${{ steps.test_params.outputs.exclude_apps }}
  run: |
    echo "Running E2E platform tests..."
    # Arguments are kept in an array so that each one reaches pytest as a
    # single word, even if the exclusion list contains spaces.
    pytest_args=(
      -v
      --tb=short
      --html=e2e-report.html
      --self-contained-html
      --json-report
      --json-report-file=e2e-report.json
      --continue-on-collection-errors
    )
    if [[ -n "$EXCLUDE_APPS" ]]; then
      pytest_args+=("--exclude-app=${EXCLUDE_APPS}")
    fi
    pytest tests/e2e/test_platform_e2e.py "${pytest_args[@]}"
- name: Upload test results
  # Reports are published whether the tests passed, failed or were skipped.
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: validation-results
    retention-days: 30
    # HTML and JSON reports written by the two pytest steps.
    path: |
      app-state-report.html
      app-state-report.json
      e2e-report.html
      e2e-report.json
- name: Parse test results
  if: always()
  id: parse_results
  run: |
    # Print one counter from the "summary" object of a JSON report.
    # Prints 0 when the report file does not exist or the field is absent.
    summary_count() {
      local report="$1" field="$2"
      if [[ -f "$report" ]]; then
        jq -r ".summary.${field} // 0" "$report"
      else
        echo 0
      fi
    }

    # App state validation counters.
    APP_TOTAL=$(summary_count app-state-report.json total)
    APP_PASSED=$(summary_count app-state-report.json passed)
    APP_FAILED=$(summary_count app-state-report.json failed)

    # E2E counters (xfailed is tracked for E2E only).
    E2E_TOTAL=$(summary_count e2e-report.json total)
    E2E_PASSED=$(summary_count e2e-report.json passed)
    E2E_FAILED=$(summary_count e2e-report.json failed)
    E2E_XFAILED=$(summary_count e2e-report.json xfailed)

    # Publish combined totals first, then the per-suite numbers, as
    # steps.parse_results.outputs.*.
    {
      echo "total=$((APP_TOTAL + E2E_TOTAL))"
      echo "passed=$((APP_PASSED + E2E_PASSED))"
      echo "failed=$((APP_FAILED + E2E_FAILED))"
      echo "app_total=${APP_TOTAL}"
      echo "app_passed=${APP_PASSED}"
      echo "app_failed=${APP_FAILED}"
      echo "e2e_total=${E2E_TOTAL}"
      echo "e2e_passed=${E2E_PASSED}"
      echo "e2e_failed=${E2E_FAILED}"
      echo "e2e_xfailed=${E2E_XFAILED}"
    } >> "$GITHUB_OUTPUT"
- name: Comment on PR
  if: github.event_name == 'pull_request' && always()
  continue-on-error: true  # Don't fail job if comment fails
  uses: actions/github-script@v7
  # Values are handed to the script through the environment instead of
  # being substituted into the JavaScript source, so unusual characters in
  # any value cannot break or alter the script.
  env:
    TOTAL: ${{ steps.parse_results.outputs.total }}
    PASSED: ${{ steps.parse_results.outputs.passed }}
    FAILED: ${{ steps.parse_results.outputs.failed }}
    APP_TOTAL: ${{ steps.parse_results.outputs.app_total }}
    APP_PASSED: ${{ steps.parse_results.outputs.app_passed }}
    APP_FAILED: ${{ steps.parse_results.outputs.app_failed }}
    E2E_TOTAL: ${{ steps.parse_results.outputs.e2e_total }}
    E2E_PASSED: ${{ steps.parse_results.outputs.e2e_passed }}
    E2E_FAILED: ${{ steps.parse_results.outputs.e2e_failed }}
    E2E_XFAILED: ${{ steps.parse_results.outputs.e2e_xfailed }}
    CLUSTER_MODE: ${{ steps.cluster_mode.outputs.mode }}
    EXCLUDED_APPS: "${{ steps.test_params.outputs.exclude_apps || 'None' }}"
    ONLY_CRITICAL: ${{ steps.test_params.outputs.only_critical }}
    REPORT_URL: "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
  with:
    script: |
      const env = process.env;

      // The run counts as passed only when the combined failure count is
      // exactly "0"; an empty value (results never parsed) shows as failed.
      const ok = env.FAILED === '0';
      const status = ok ? '✅ PASSED' : '❌ FAILED';
      const color = ok ? '🟢' : '🔴';

      // Built line by line so YAML indentation never leaks into the
      // Markdown; empty entries separate the sections so headings and the
      // trailing metadata are not absorbed into the preceding list.
      const comment = [
        `## ${color} Platform Validation & E2E Tests ${status}`,
        '',
        '### 📋 App State Validation',
        `- Total: ${env.APP_TOTAL}`,
        `- Passed: ✅ ${env.APP_PASSED}`,
        `- Failed: ❌ ${env.APP_FAILED}`,
        '',
        '### 🧪 E2E Platform Tests',
        `- Total: ${env.E2E_TOTAL}`,
        `- Passed: ✅ ${env.E2E_PASSED}`,
        `- Failed: ❌ ${env.E2E_FAILED}`,
        `- Expected Failures: ⚠️ ${env.E2E_XFAILED}`,
        '',
        '### 📊 Combined Results',
        `- Total Tests: ${env.TOTAL}`,
        `- Passed: ✅ ${env.PASSED}`,
        `- Failed: ❌ ${env.FAILED}`,
        '',
        `**Cluster Mode:** ${env.CLUSTER_MODE}`,
        `**Excluded Apps:** ${env.EXCLUDED_APPS}`,
        `**Only Critical:** ${env.ONLY_CRITICAL}`,
        '',
        `📊 [View detailed report](${env.REPORT_URL})`,
        '',
      ].join('\n');

      // Awaited so that an API error is reported by this step instead of
      // surfacing as an unhandled promise rejection after the script returns.
      await github.rest.issues.createComment({
        issue_number: context.issue.number,
        owner: context.repo.owner,
        repo: context.repo.repo,
        body: comment
      });
- name: Generate summary
  if: always()
  run: |
    # Everything printed inside the group is appended to the job summary
    # with a single redirection.
    {
      echo "## Platform Validation & E2E Test Results"
      echo ""

      # App state validation: step outcome plus parsed counters.
      echo "### 📋 App State Validation"
      echo "**Status:** ${{ steps.validation.outcome }}"
      echo "**Total:** ${{ steps.parse_results.outputs.app_total }}"
      echo "**Passed:** ${{ steps.parse_results.outputs.app_passed }}"
      echo "**Failed:** ${{ steps.parse_results.outputs.app_failed }}"
      echo ""

      # E2E tests: step outcome plus parsed counters.
      echo "### 🧪 E2E Platform Tests"
      echo "**Status:** ${{ steps.e2e_tests.outcome }}"
      echo "**Total:** ${{ steps.parse_results.outputs.e2e_total }}"
      echo "**Passed:** ${{ steps.parse_results.outputs.e2e_passed }}"
      echo "**Failed:** ${{ steps.parse_results.outputs.e2e_failed }}"
      echo "**Expected Failures:** ${{ steps.parse_results.outputs.e2e_xfailed }}"
      echo ""

      # Totals across both suites.
      echo "### 📊 Combined Results"
      echo "**Total Tests:** ${{ steps.parse_results.outputs.total }}"
      echo "**Passed:** ${{ steps.parse_results.outputs.passed }}"
      echo "**Failed:** ${{ steps.parse_results.outputs.failed }}"
      echo ""

      # Run configuration.
      echo "**Cluster Mode:** ${{ steps.cluster_mode.outputs.mode }}"
      echo "**Excluded Apps:** ${{ steps.test_params.outputs.exclude_apps || 'None' }}"
      echo "**Only Critical:** ${{ steps.test_params.outputs.only_critical }}"
    } >> "$GITHUB_STEP_SUMMARY"
- name: Cleanup Kind cluster
  # Always tear down the cluster for kind-mode runs, whatever happened before.
  if: always() && steps.cluster_mode.outputs.mode == 'kind'
  # A failed delete (e.g. the cluster was never created) is ignored.
  run: kind delete cluster --name kagenti-demo || true
- name: Fail job if validation failed
  # An `if` without a status function is implicitly combined with success(),
  # so the bare outcome check could never be true once the validation step
  # had failed. always() lets the condition actually be evaluated.
  if: always() && steps.validation.outcome == 'failure'
  run: |
    echo "::error::App state validation failed"
    exit 1