Deployment phase 2: fix CI #168

Workflow file for this run

.github/workflows/app-state-validation.yml at 0f542ba

	# Display name shown for this workflow in the Actions tab.
	name: Platform Validation & E2E Tests

	# Triggers: pushes and pull requests against the two listed branches, plus
	# manual dispatch with optional tuning inputs.
	on:
	  push:
	    branches: [main, argocd-gitops-dev]
	  pull_request:
	    branches: [main, argocd-gitops-dev]
	  workflow_dispatch:
	    inputs:
	      # Where the platform runs: a throwaway Kind cluster or an existing one.
	      cluster_mode:
	        description: 'Cluster mode (kind/existing)'
	        required: false
	        default: 'kind'
	        type: choice
	        options: [kind, existing]
	      # Forwarded to pytest as --exclude-app when non-empty.
	      exclude_apps:
	        description: 'Comma-separated apps to exclude'
	        required: false
	        default: ''
	      # Forwarded to pytest as --only-critical when true.
	      only_critical:
	        description: 'Only validate critical apps'
	        required: false
	        default: false
	        type: boolean

	# Explicit GITHUB_TOKEN scopes for this workflow (PR commenting needs
	# pull-requests: write).
	# NOTE(review): pull_request runs from forks normally get a read-only token
	# regardless of this block - confirm that the comment step's
	# continue-on-error is the intended fallback for fork PRs.
	permissions:
	  contents: read        # Read repository files
	  pull-requests: write  # Comment on PRs
	  actions: read         # Read workflow runs
	  checks: write         # Update check status

	# Pinned tool versions shared by the install steps below. Kept quoted so
	# YAML never re-types them (e.g. 3.10 -> 3.1).
	env:
	  PYTHON_VERSION: '3.11'
	  KIND_VERSION: 'v0.20.0'
	  KUBECTL_VERSION: 'v1.28.0'
	  ARGOCD_VERSION: 'v2.9.3'

	jobs:
	  validate-app-state:
	    name: Validate ArgoCD Application State
	    runs-on: ubuntu-latest
	    # Extended to 60 minutes for complete platform testing (all apps
	    # including optional observability, Kiali, Ollama).
	    timeout-minutes: 60

	    steps:
	# Fetch the repository so scripts/ and tests/ are available to later steps.
	- name: Checkout repository
	  uses: actions/checkout@v4

	# Install the interpreter pinned in env.PYTHON_VERSION and enable the
	# action's pip cache.
	- name: Set up Python
	  uses: actions/setup-python@v5
	  with:
	    python-version: ${{ env.PYTHON_VERSION }}
	    cache: 'pip'

	# Install the test tooling used by the validation and E2E steps.
	- name: Install Python dependencies
	  run: |
	    pip install --upgrade pip
	    # Every requirement is quoted: an unquoted `>=` is parsed by the shell
	    # as an output redirection, which silently drops the version floor and
	    # writes pip's output to stray files such as "=28.1.0".
	    pip install \
	      "kubernetes>=28.1.0" \
	      "pytest>=8.0.0" \
	      "pytest-html>=4.1.0" \
	      "pytest-json-report>=1.5.0" \
	      "tenacity>=8.2.3" \
	      "rich>=13.7.0"

	# Publishes `mode` (kind | existing) for the conditional steps below.
	# Manual runs use the dispatch input; every other trigger uses Kind.
	- name: Determine cluster mode
	  id: cluster_mode
	  env:
	    # Context values are passed through the environment rather than being
	    # interpolated into the script text, so they are never parsed as shell.
	    EVENT_NAME: ${{ github.event_name }}
	    REQUESTED_MODE: ${{ inputs.cluster_mode }}
	  run: |
	    if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
	      echo "mode=${REQUESTED_MODE}" >> "$GITHUB_OUTPUT"
	    else
	      echo "mode=kind" >> "$GITHUB_OUTPUT"
	    fi

	# Kind mode only: delete large pre-installed toolchains from the runner to
	# make room for images. Each removal is best-effort (`|| true`), so a
	# failed delete never fails the job.
	- name: Free up disk space (manual cleanup)
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    echo "🧹 Freeing up disk space by removing pre-installed software..."
	    echo "Disk usage BEFORE cleanup:"
	    df -h /

	    # Remove large pre-installed software
	    echo "Removing .NET SDK (~20GB)..."
	    sudo rm -rf /usr/share/dotnet || true

	    echo "Removing Android SDK (~10GB)..."
	    sudo rm -rf /usr/local/lib/android || true

	    echo "Removing Haskell (~5GB)..."
	    sudo rm -rf /opt/ghc || true
	    sudo rm -rf /usr/local/.ghcup || true

	    echo "Removing CodeQL (~5GB)..."
	    sudo rm -rf /opt/hostedtoolcache/CodeQL || true

	    echo "Removing unused tools..."
	    sudo rm -rf /usr/local/share/powershell || true
	    sudo rm -rf /usr/local/share/chromium || true
	    sudo rm -rf /usr/local/lib/node_modules || true

	    echo ""
	    echo "Disk usage AFTER cleanup:"
	    df -h /

	    echo ""
	    echo "✅ Cleanup complete"

	# Kind mode only: set up Docker Buildx on the runner.
	- name: Set up Docker (for Kind)
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  uses: docker/setup-buildx-action@v3

	# Kind mode only: prune all unused Docker data before building anything.
	# Reporting and pruning are best-effort and never fail the job.
	- name: Clean up Docker at workflow start
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    echo "🧹 Initial cleanup - removing stale Docker resources from previous runs..."
	    echo "Disk usage BEFORE cleanup:"
	    df -h
	    docker system df || true

	    # Remove all unused Docker resources from previous runs
	    docker system prune -af --volumes || true

	    echo ""
	    echo "Disk usage AFTER cleanup:"
	    df -h
	    docker system df || true

	# Kind mode only: download the pinned Kind release and put it on PATH.
	- name: Install Kind
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    # -f makes curl exit non-zero on an HTTP error instead of saving the
	    # error page as the "binary" (same flag the ArgoCD CLI step uses).
	    curl -fLo ./kind "https://kind.sigs.k8s.io/dl/${{ env.KIND_VERSION }}/kind-linux-amd64"
	    chmod +x ./kind
	    sudo mv ./kind /usr/local/bin/kind
	    kind version

	# Download the pinned kubectl release and put it on PATH (runs in both
	# cluster modes).
	- name: Install kubectl
	  run: |
	    # -f makes curl exit non-zero on an HTTP error instead of saving the
	    # error page as the "binary" (same flag the ArgoCD CLI step uses).
	    curl -fLO "https://dl.k8s.io/release/${{ env.KUBECTL_VERSION }}/bin/linux/amd64/kubectl"
	    chmod +x kubectl
	    sudo mv kubectl /usr/local/bin/
	    kubectl version --client

	# Download the pinned ArgoCD CLI release and put it on PATH.
	- name: Install ArgoCD CLI
	  run: |
	    cd /tmp
	    # -f: fail on HTTP errors; -sS: quiet but still print errors; -L: follow redirects.
	    curl -fsSL -o argocd "https://github.com/argoproj/argo-cd/releases/download/${{ env.ARGOCD_VERSION }}/argocd-linux-amd64"
	    chmod +x argocd
	    sudo mv argocd /usr/local/bin/argocd
	    argocd version --client

	# Kind mode only: build both operator images from the operator repository
	# and save them as tarballs under .images/ in the workspace.
	- name: Build and export operator images for CI
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    echo "🔨 Building operator images for CI (AMD64)..."

	    # Clone operator repository (shallow, single branch)
	    git clone --depth 1 --branch fix/add-kagenti-operator-image-build \
	      https://github.com/Ladas/kagenti-operator /tmp/kagenti-operator

	    # Build kagenti-operator
	    echo "Building kagenti-operator..."
	    cd /tmp/kagenti-operator/kagenti-operator
	    docker build -t localhost:5001/kagenti-operator:dev .

	    # Build platform-operator
	    echo "Building platform-operator..."
	    cd /tmp/kagenti-operator/platform-operator
	    docker build -t localhost:5001/kagenti-platform-operator:dev .

	    # Export as tar files for quick-redeploy.sh
	    echo "Exporting operator images as tar files..."
	    cd "$GITHUB_WORKSPACE"
	    mkdir -p .images
	    docker save localhost:5001/kagenti-operator:dev -o .images/kagenti-operator-dev.tar
	    docker save localhost:5001/kagenti-platform-operator:dev -o .images/kagenti-platform-operator-dev.tar

	    echo "✅ Operator images built and exported successfully (AMD64)"
	    ls -lah .images/

	# Kind mode only: after the image tarballs are saved, prune Docker and
	# delete the operator clone to reclaim disk.
	- name: Clean up Docker to free disk space
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    echo "🧹 Cleaning up Docker resources to prevent disk exhaustion..."
	    echo "Before cleanup:"
	    df -h
	    # Housekeeping is best-effort, matching the cleanup step at workflow
	    # start: a failed report or prune must not fail the whole job.
	    docker system df || true

	    # Remove build cache, unused images, and stopped containers
	    docker system prune -af --volumes || true

	    # Remove operator repository clone to save space
	    rm -rf /tmp/kagenti-operator

	    echo ""
	    echo "After cleanup:"
	    df -h
	    docker system df || true

	# Kind mode only: run the repository's redeploy script.
	- name: Deploy Platform with quick-redeploy.sh
	  if: steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    chmod +x ./scripts/quick-redeploy.sh
	    # CI mode skips interactive prompts automatically
	    ./scripts/quick-redeploy.sh

	# Block until the monitoring script reports on the ArgoCD applications
	# (runs in both cluster modes).
	- name: Wait for ArgoCD applications to sync and become healthy
	  run: |
	    echo "Waiting for ArgoCD applications to sync and become healthy..."
	    echo "Using enhanced monitoring script with formatted status tables..."
	    echo ""

	    # Run enhanced monitoring script with 60-minute timeout
	    # Script monitors ALL apps (CRITICAL and OPTIONAL) and displays:
	    # - Formatted ArgoCD application status table
	    # - Formatted pod status by namespace table
	    # - Progress tracking with elapsed time
	    # - Smart failure logic (only fails on CRITICAL apps degraded)
	    # NOTE(review): 3600 s equals the job's own timeout-minutes (60), so
	    # the job timeout would presumably fire before this one - confirm.
	    ./scripts/monitor-argocd-apps.sh 3600

	# Publishes `exclude_apps` and `only_critical` for the test steps.
	# Only manual (workflow_dispatch) runs may narrow the test scope; every
	# other trigger tests all apps.
	- name: Set test parameters
	  id: test_params
	  env:
	    # Free-form user input is handed over via the environment instead of
	    # being interpolated into the script, so it can never be parsed as
	    # shell syntax.
	    EVENT_NAME: ${{ github.event_name }}
	    EXCLUDE_APPS_INPUT: ${{ inputs.exclude_apps }}
	    ONLY_CRITICAL_INPUT: ${{ inputs.only_critical }}
	  run: |
	    # No automatic exclusions - test all apps in CI
	    # For manual runs, use workflow_dispatch inputs to exclude apps if needed
	    if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
	      EXCLUDE_APPS="$EXCLUDE_APPS_INPUT"
	      ONLY_CRITICAL="$ONLY_CRITICAL_INPUT"
	    else
	      EXCLUDE_APPS=""
	      ONLY_CRITICAL="false"
	    fi

	    {
	      echo "exclude_apps=${EXCLUDE_APPS}"
	      echo "only_critical=${ONLY_CRITICAL}"
	    } >> "$GITHUB_OUTPUT"

	# Run the app-state pytest suite and write HTML and JSON reports into the
	# workspace. A failure here fails the job.
	- name: Run app state validation
	  id: validation
	  env:
	    # Step outputs reach the script through the environment rather than
	    # by text interpolation.
	    EXCLUDE_APPS: ${{ steps.test_params.outputs.exclude_apps }}
	    ONLY_CRITICAL: ${{ steps.test_params.outputs.only_critical }}
	  run: |
	    # An array keeps each option a single argument even if a value
	    # contains spaces (the unquoted string form was word-split).
	    pytest_args=(
	      -v
	      --html=app-state-report.html
	      --self-contained-html
	      --json-report
	      --json-report-file=app-state-report.json
	    )

	    if [[ -n "$EXCLUDE_APPS" ]]; then
	      pytest_args+=("--exclude-app=${EXCLUDE_APPS}")
	    fi

	    if [[ "$ONLY_CRITICAL" == "true" ]]; then
	      pytest_args+=(--only-critical)
	    fi

	    pytest tests/validation/test_app_state.py "${pytest_args[@]}"

	# Diagnostics, collected only after an earlier step has failed. Everything
	# is written to /tmp/debug-logs; every capture is best-effort so that one
	# missing namespace or pod does not abort the rest.
	- name: Capture failing pod logs
	  if: failure()  # Only run if validation failed
	  run: |
	    echo "=== Capturing logs from failing pods ==="
	    mkdir -p /tmp/debug-logs

	    # Capture logs from all namespaces with CrashLoopBackOff pods.
	    # jq prints one "<namespace> <pod>" line per crashing container.
	    echo "Finding CrashLoopBackOff pods..."
	    kubectl get pods -A -o json | jq -r '
	      .items[] |
	      select(.status.containerStatuses[]? | select(.state.waiting?.reason == "CrashLoopBackOff")) |
	      "\(.metadata.namespace) \(.metadata.name)"
	    ' | while read -r ns pod; do
	      echo "Capturing logs from $ns/$pod"
	      kubectl logs -n "$ns" "$pod" --all-containers --tail=200 > "/tmp/debug-logs/${ns}_${pod}.log" 2>&1 || echo "Failed to get logs from $ns/$pod"
	      kubectl logs -n "$ns" "$pod" --all-containers --previous --tail=200 > "/tmp/debug-logs/${ns}_${pod}_previous.log" 2>&1 || echo "No previous logs for $ns/$pod"
	    done

	    # Specifically capture operator logs (even if not crashing)
	    echo "Capturing kagenti-operator logs..."
	    kubectl logs -n kagenti-operator -l control-plane=controller-manager --tail=200 > /tmp/debug-logs/kagenti-operator-all.log 2>&1 || echo "No kagenti-operator logs"

	    echo "Capturing kagenti-platform-operator logs..."
	    kubectl logs -n kagenti-platform-operator -l control-plane=controller-manager --tail=200 > /tmp/debug-logs/platform-operator-all.log 2>&1 || echo "No platform-operator logs"

	    # Tekton logs
	    echo "Capturing tekton logs..."
	    kubectl logs -n tekton-pipelines -l app.kubernetes.io/part-of=tekton-pipelines --tail=200 > /tmp/debug-logs/tekton-all.log 2>&1 || echo "No tekton logs"

	    # Capture all events sorted by time (best-effort like the other
	    # captures, so a failure here does not skip the ones below)
	    echo "Capturing cluster events..."
	    kubectl get events -A --sort-by='.lastTimestamp' > /tmp/debug-logs/all-events.txt 2>&1 || true

	    # Capture pod descriptions for failing pods
	    echo "Capturing pod descriptions..."
	    kubectl describe pods -n kagenti-operator > /tmp/debug-logs/kagenti-operator-describe.txt 2>&1 || true
	    kubectl describe pods -n kagenti-platform-operator > /tmp/debug-logs/platform-operator-describe.txt 2>&1 || true
	    kubectl describe pods -n tekton-pipelines > /tmp/debug-logs/tekton-describe.txt 2>&1 || true

	    # Capture ArgoCD application statuses
	    echo "Capturing ArgoCD application details..."
	    kubectl get applications -n argocd -o yaml > /tmp/debug-logs/argocd-applications.yaml 2>&1 || true

	    echo "=== Debug logs captured ==="
	    ls -lah /tmp/debug-logs/

	# Publish the collected /tmp/debug-logs directory as an artifact, kept for
	# 7 days. Runs only after an earlier step has failed.
	- name: Upload debug logs
	  if: failure()  # Only upload if validation failed
	  uses: actions/upload-artifact@v4
	  with:
	    name: crash-debug-logs
	    path: /tmp/debug-logs/
	    retention-days: 7

	# Run the E2E pytest suite and write HTML and JSON reports. Marked
	# continue-on-error, so a failing E2E run does not fail the job.
	- name: Run E2E platform tests
	  id: e2e_tests
	  continue-on-error: true
	  env:
	    # The step output reaches the script through the environment rather
	    # than by text interpolation.
	    EXCLUDE_APPS: ${{ steps.test_params.outputs.exclude_apps }}
	  run: |
	    echo "Running E2E platform tests..."
	    # An array keeps each option a single argument even if a value
	    # contains spaces (the unquoted string form was word-split).
	    pytest_args=(
	      -v
	      --tb=short
	      --html=e2e-report.html
	      --self-contained-html
	      --json-report
	      --json-report-file=e2e-report.json
	      --continue-on-collection-errors
	    )

	    if [[ -n "$EXCLUDE_APPS" ]]; then
	      pytest_args+=("--exclude-app=${EXCLUDE_APPS}")
	    fi

	    pytest tests/e2e/test_platform_e2e.py "${pytest_args[@]}"

	# Always publish whichever pytest reports exist (one path per line), kept
	# for 30 days.
	- name: Upload test results
	  if: always()
	  uses: actions/upload-artifact@v4
	  with:
	    name: validation-results
	    path: |
	      app-state-report.html
	      app-state-report.json
	      e2e-report.html
	      e2e-report.json
	    retention-days: 30

	# Read the counters from both JSON reports and publish them, plus combined
	# totals, as step outputs. A missing report or missing key counts as 0.
	- name: Parse test results
	  if: always()
	  id: parse_results
	  run: |
	    # summary_count <report.json> <key>: print .summary.<key>, or 0 when
	    # the report file or the key is absent.
	    summary_count() {
	      if [[ -f "$1" ]]; then
	        jq -r --arg key "$2" '.summary[$key] // 0' "$1"
	      else
	        echo 0
	      fi
	    }

	    # Parse app state validation results
	    APP_TOTAL=$(summary_count app-state-report.json total)
	    APP_PASSED=$(summary_count app-state-report.json passed)
	    APP_FAILED=$(summary_count app-state-report.json failed)

	    # Parse E2E test results
	    E2E_TOTAL=$(summary_count e2e-report.json total)
	    E2E_PASSED=$(summary_count e2e-report.json passed)
	    E2E_FAILED=$(summary_count e2e-report.json failed)
	    E2E_XFAILED=$(summary_count e2e-report.json xfailed)

	    # Combined totals
	    TOTAL=$((APP_TOTAL + E2E_TOTAL))
	    PASSED=$((APP_PASSED + E2E_PASSED))
	    FAILED=$((APP_FAILED + E2E_FAILED))

	    {
	      echo "total=${TOTAL}"
	      echo "passed=${PASSED}"
	      echo "failed=${FAILED}"
	      echo "app_total=${APP_TOTAL}"
	      echo "app_passed=${APP_PASSED}"
	      echo "app_failed=${APP_FAILED}"
	      echo "e2e_total=${E2E_TOTAL}"
	      echo "e2e_passed=${E2E_PASSED}"
	      echo "e2e_failed=${E2E_FAILED}"
	      echo "e2e_xfailed=${E2E_XFAILED}"
	    } >> "$GITHUB_OUTPUT"

	# On pull_request runs, post one comment summarising both test suites.
	# continue-on-error keeps a failed API call from failing the job.
	- name: Comment on PR
	  if: github.event_name == 'pull_request' && always()
	  continue-on-error: true  # Don't fail job if comment fails
	  uses: actions/github-script@v7
	  with:
	    script: |
	      // Step outputs arrive as strings, hence the string comparison below.
	      const total = '${{ steps.parse_results.outputs.total }}';
	      const passed = '${{ steps.parse_results.outputs.passed }}';
	      const failed = '${{ steps.parse_results.outputs.failed }}';

	      const appTotal = '${{ steps.parse_results.outputs.app_total }}';
	      const appPassed = '${{ steps.parse_results.outputs.app_passed }}';
	      const appFailed = '${{ steps.parse_results.outputs.app_failed }}';

	      const e2eTotal = '${{ steps.parse_results.outputs.e2e_total }}';
	      const e2ePassed = '${{ steps.parse_results.outputs.e2e_passed }}';
	      const e2eFailed = '${{ steps.parse_results.outputs.e2e_failed }}';
	      const e2eXFailed = '${{ steps.parse_results.outputs.e2e_xfailed }}';

	      const status = failed === '0' ? '✅ PASSED' : '❌ FAILED';
	      const color = failed === '0' ? '🟢' : '🔴';

	      const comment = `## ${color} Platform Validation & E2E Tests ${status}

	      ### 📋 App State Validation
	      - Total: ${appTotal}
	      - Passed: ✅ ${appPassed}
	      - Failed: ❌ ${appFailed}

	      ### 🧪 E2E Platform Tests
	      - Total: ${e2eTotal}
	      - Passed: ✅ ${e2ePassed}
	      - Failed: ❌ ${e2eFailed}
	      - Expected Failures: ⚠️ ${e2eXFailed}

	      ### 📊 Combined Results
	      - Total Tests: ${total}
	      - Passed: ✅ ${passed}
	      - Failed: ❌ ${failed}

	      Cluster Mode: ${{ steps.cluster_mode.outputs.mode }}
	      Excluded Apps: ${{ steps.test_params.outputs.exclude_apps || 'None' }}
	      Only Critical: ${{ steps.test_params.outputs.only_critical }}

	      📊 [View detailed report](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})
	      `;

	      // Await the request so an API failure rejects inside this step
	      // instead of surfacing as an unhandled promise rejection.
	      await github.rest.issues.createComment({
	        issue_number: context.issue.number,
	        owner: context.repo.owner,
	        repo: context.repo.repo,
	        body: comment
	      });

	# Always append the same figures to the run's step summary page.
	- name: Generate summary
	  if: always()
	  run: |
	    # One grouped redirect instead of appending line by line.
	    {
	      echo "## Platform Validation & E2E Test Results"
	      echo ""

	      echo "### 📋 App State Validation"
	      echo "Status: ${{ steps.validation.outcome }}"
	      echo "Total: ${{ steps.parse_results.outputs.app_total }}"
	      echo "Passed: ${{ steps.parse_results.outputs.app_passed }}"
	      echo "Failed: ${{ steps.parse_results.outputs.app_failed }}"
	      echo ""

	      echo "### 🧪 E2E Platform Tests"
	      echo "Status: ${{ steps.e2e_tests.outcome }}"
	      echo "Total: ${{ steps.parse_results.outputs.e2e_total }}"
	      echo "Passed: ${{ steps.parse_results.outputs.e2e_passed }}"
	      echo "Failed: ${{ steps.parse_results.outputs.e2e_failed }}"
	      echo "Expected Failures: ${{ steps.parse_results.outputs.e2e_xfailed }}"
	      echo ""

	      echo "### 📊 Combined Results"
	      echo "Total Tests: ${{ steps.parse_results.outputs.total }}"
	      echo "Passed: ${{ steps.parse_results.outputs.passed }}"
	      echo "Failed: ${{ steps.parse_results.outputs.failed }}"
	      echo ""

	      echo "Cluster Mode: ${{ steps.cluster_mode.outputs.mode }}"
	      echo "Excluded Apps: ${{ steps.test_params.outputs.exclude_apps || 'None' }}"
	      echo "Only Critical: ${{ steps.test_params.outputs.only_critical }}"
	    } >> "$GITHUB_STEP_SUMMARY"

	# Kind mode only: always try to delete the kagenti-demo cluster, ignoring
	# any error from the delete itself.
	- name: Cleanup Kind cluster
	  if: always() && steps.cluster_mode.outputs.mode == 'kind'
	  run: |
	    kind delete cluster --name kagenti-demo || true

	# Final explicit failure marker for a failed validation step.
	# always() is required: an `if` without a status function gets an implicit
	# success(), which made this step unreachable once validation had failed.
	- name: Fail job if validation failed
	  if: always() && steps.validation.outcome == 'failure'
	  run: exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Deployment phase 2: fix CI #168

Workflow file

Deployment phase 2: fix CI #168

Uh oh!

Workflow file for this run