refactor(milvus): share lifecycle across stores #4441

Workflow file for this run

.github/workflows/integration-test-k8s.yml at 4fa457f

	name: Integration Test [Kubernetes]

	on:
	pull_request:
	types: [opened, synchronize, reopened, ready_for_review]
	branches:
	- main
	paths-ignore:
	- 'website/**'
	push:
	branches:
	- main
	paths-ignore:
	- 'website/**'
	workflow_dispatch: # Allow manual triggering

	concurrency:
	group: ${{ github.workflow }}-${{ github.ref }}
	cancel-in-progress: true

	jobs:
	# Detect which files changed to determine which profiles to test
	changes:
	uses: ./.github/workflows/ci-changes.yml

	# Determine which profiles need to be tested based on file changes
	determine-profiles:
	needs: changes
	runs-on: ubuntu-latest
	outputs:
	profiles: ${{ steps.set-matrix.outputs.profiles }}
	should_run: ${{ steps.set-matrix.outputs.should_run }}
	steps:
	- id: set-matrix
	run: \|
	# Run the default PR baseline profiles if common e2e code, core code changes, or manual/scheduled trigger
	if [[ "${{ needs.changes.outputs.e2e_common }}" == "true" ]] \|\| \
	[[ "${{ needs.changes.outputs.core }}" == "true" ]] \|\| \
	[[ "${{ needs.changes.outputs.docker }}" == "true" ]] \|\| \
	[[ "${{ needs.changes.outputs.make }}" == "true" ]] \|\| \
	[[ "${{ needs.changes.outputs.ci }}" == "true" ]] \|\| \
	[[ "${{ needs.changes.outputs.agent_exec }}" == "true" ]] \|\| \
	[[ "${{ github.event_name }}" == "schedule" ]] \|\| \
	[[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
	echo 'profiles=["kubernetes", "dashboard"]' >> $GITHUB_OUTPUT
	echo 'should_run=true' >> $GITHUB_OUTPUT
	echo "Running default baseline profiles due to common/core changes or push/schedule/manual trigger"
	exit 0
	fi

	# Only run affected profiles for PRs
	profiles=()
	[[ "${{ needs.changes.outputs.e2e_istio }}" == "true" ]] && profiles+=("istio")
	[[ "${{ needs.changes.outputs.e2e_kubernetes }}" == "true" ]] && profiles+=("kubernetes")
	[[ "${{ needs.changes.outputs.e2e_aibrix }}" == "true" ]] && profiles+=("aibrix")
	[[ "${{ needs.changes.outputs.e2e_dashboard }}" == "true" ]] && profiles+=("dashboard")
	[[ "${{ needs.changes.outputs.e2e_llm_d }}" == "true" ]] && profiles+=("llm-d")
	[[ "${{ needs.changes.outputs.e2e_routing_strategies }}" == "true" ]] && profiles+=("routing-strategies")
	[[ "${{ needs.changes.outputs.e2e_production_stack }}" == "true" ]] && profiles+=("production-stack")
	[[ "${{ needs.changes.outputs.e2e_dynamic_config }}" == "true" ]] && profiles+=("dynamic-config")
	[[ "${{ needs.changes.outputs.e2e_ml_model_selection }}" == "true" ]] && profiles+=("ml-model-selection")
	[[ "${{ needs.changes.outputs.e2e_multi_endpoint }}" == "true" ]] && profiles+=("multi-endpoint")
	[[ "${{ needs.changes.outputs.e2e_authz_rbac }}" == "true" ]] && profiles+=("authz-rbac")
	[[ "${{ needs.changes.outputs.e2e_streaming }}" == "true" ]] && profiles+=("streaming")

	# Convert to JSON array
	if [ ${#profiles[@]} -eq 0 ]; then
	echo 'profiles=[]' >> $GITHUB_OUTPUT
	echo 'should_run=false' >> $GITHUB_OUTPUT
	echo "No profile changes detected, skipping all e2e tests"
	else
	printf -v json '"%s",' "${profiles[@]}"
	echo "profiles=[${json%,}]" >> $GITHUB_OUTPUT
	echo 'should_run=true' >> $GITHUB_OUTPUT
	echo "Running profiles: ${profiles[*]}"
	fi

	integration-test:
	needs: [changes, determine-profiles]
	if: ${{ needs.determine-profiles.outputs.should_run == 'true' && !github.event.pull_request.draft }}
	runs-on: ubuntu-latest
	timeout-minutes: 90
	env:
	E2E_SEMANTIC_ROUTER_HELM_TIMEOUT: 60m
	strategy:
	fail-fast: false # Continue testing other profiles even if one fails
	matrix:
	# Dynamic profile matrix based on detected changes
	profile: ${{ fromJson(needs.determine-profiles.outputs.profiles) }}

	steps:
	- name: Check out the repo
	uses: actions/checkout@v4

	- name: Free disk space and relocate Docker to /mnt
	run: \|
	echo "=== Before cleanup ==="
	df -h / /mnt

	# Remove large pre-installed toolchains that E2E tests don't need
	sudo rm -rf /usr/local/lib/android /usr/share/dotnet /opt/ghc \
	/usr/local/share/boost /usr/local/graalvm /usr/local/.ghcup \
	/usr/share/swift /usr/local/lib/node_modules 2>/dev/null \|\| true
	sudo docker image prune -af 2>/dev/null \|\| true

	# Move Docker data root to /mnt (75 GB+ free vs ~14 GB on /)
	sudo systemctl stop docker
	sudo mv /var/lib/docker /mnt/docker
	sudo ln -s /mnt/docker /var/lib/docker
	sudo systemctl start docker

	# Redirect temp dir so `kind load docker-image` tarballs land on /mnt
	sudo mkdir -p /mnt/tmp
	sudo chmod 1777 /mnt/tmp

	echo "=== After cleanup ==="
	df -h / /mnt

	- name: Set up Go
	uses: actions/setup-go@v5
	with:
	go-version: '1.24'

	- name: Set up Rust
	uses: actions-rust-lang/setup-rust-toolchain@v1
	with:
	toolchain: "1.90"

	- name: Install system dependencies
	run: \|
	sudo apt-get update
	sudo apt-get install -y \
	make \
	build-essential \
	pkg-config

	- name: Install Kind
	run: \|
	ARCH="$(uname -m)"
	case "$ARCH" in
	x86_64) KIND_ARCH="amd64" ;;
	aarch64) KIND_ARCH="arm64" ;;
	*) echo "unsupported arch: $ARCH" && exit 1 ;;
	esac
	curl --retry 5 --retry-delay 5 --retry-all-errors -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.22.0/kind-linux-${KIND_ARCH}"
	chmod +x ./kind
	sudo mv ./kind /usr/local/bin/kind

	- name: Pre-pull Kind node image
	run: \|
	KIND_NODE_IMAGE="kindest/node:v1.29.2"
	echo "Pre-pulling ${KIND_NODE_IMAGE} with retries..."
	for attempt in 1 2 3 4 5; do
	if docker pull "${KIND_NODE_IMAGE}"; then
	echo "Successfully pulled ${KIND_NODE_IMAGE}"
	break
	fi
	if [ "$attempt" -eq 5 ]; then
	echo "ERROR: Failed to pull ${KIND_NODE_IMAGE} after 5 attempts"
	exit 1
	fi
	echo "Pull attempt ${attempt} failed, retrying in $((attempt * 15))s..."
	sleep $((attempt * 15))
	done

	- name: Download E2E test dependencies
	run: \|
	cd e2e && go mod download

	- name: Build E2E test binary
	run: \|
	make build-e2e


	- name: Run Integration E2E tests (${{ matrix.profile }})
	id: e2e-test
	env:
	TMPDIR: /mnt/tmp
	HF_TOKEN: ${{ secrets.HF_TOKEN }}
	HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
	run: \|
	bash ./e2e/testing/stream_semantic_router_logs.sh &
	ROUTER_LOG_STREAM_PID=$!
	cleanup_router_log_stream() {
	if kill -0 "${ROUTER_LOG_STREAM_PID}" 2>/dev/null; then
	echo "[router-live] stopping background router log stream (pid ${ROUTER_LOG_STREAM_PID})"
	kill "${ROUTER_LOG_STREAM_PID}" 2>/dev/null \|\| true
	wait "${ROUTER_LOG_STREAM_PID}" 2>/dev/null \|\| true
	fi
	}
	trap cleanup_router_log_stream EXIT

	set +e
	if [ "${{ matrix.profile }}" = "kubernetes" ]; then
	# Temporarily skip the stress / pressure coverage until the suite is stable again.
	KUBERNETES_CI_TESTS="chat-completions-request,apiserver-runtime-config-endpoints,domain-classify,semantic-cache,pii-detection,jailbreak-detection,decision-priority-selection,plugin-chain-execution,rule-condition-logic,decision-fallback-behavior,plugin-config-variations"
	make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_TESTS="${KUBERNETES_CI_TESTS}" E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
	TEST_EXIT_CODE=$?
	set -e
	echo "test_exit_code=${TEST_EXIT_CODE}" >> $GITHUB_OUTPUT
	exit ${TEST_EXIT_CODE}
	fi

	make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
	TEST_EXIT_CODE=$?
	set -e
	echo "test_exit_code=${TEST_EXIT_CODE}" >> $GITHUB_OUTPUT
	exit ${TEST_EXIT_CODE}


	- name: Collect logs via kubectl (fallback)
	if: always()
	run: \|
	if [ ! -f "semantic-router-logs.txt" ]; then
	echo "⚠️ semantic-router-logs.txt not found, collecting logs via kubectl as fallback..."
	echo "========================================" > semantic-router-logs.txt
	echo "Semantic Router Logs (collected via kubectl fallback)" >> semantic-router-logs.txt
	echo "========================================" >> semantic-router-logs.txt
	echo "" >> semantic-router-logs.txt

	for pod in $(kubectl get pods -n vllm-semantic-router-system -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
	echo "=== Pod: $pod ===" >> semantic-router-logs.txt
	kubectl describe pod "$pod" -n vllm-semantic-router-system >> semantic-router-logs.txt 2>&1 \|\| true
	echo "--- Logs ---" >> semantic-router-logs.txt
	kubectl logs "$pod" -n vllm-semantic-router-system --all-containers=true >> semantic-router-logs.txt 2>&1 \|\| true
	echo "" >> semantic-router-logs.txt
	done

	echo "✅ Fallback log collection complete"
	else
	echo "✅ semantic-router-logs.txt already exists (collected by Go framework)"
	fi

	- name: Upload test reports
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: test-reports-${{ matrix.profile }}
	path: \|
	test-report.json
	test-report.md
	semantic-router-logs.txt
	response-api-artifacts/**
	retention-days: 30

	- name: Create test summary from report
	if: always()
	run: \|
	if [ -f "test-report.md" ]; then
	echo "=== Reading test report from test-report.md ==="
	cat test-report.md >> $GITHUB_STEP_SUMMARY

	# Add semantic-router logs section if available
	if [ -f "semantic-router-logs.txt" ]; then
	{
	printf '\n---\n\n### 📝 Semantic Router Logs\n\n<details>\n<summary>Click to view semantic-router logs</summary>\n\n```\n'
	} >> "$GITHUB_STEP_SUMMARY"
	# Add first 500 lines of logs to summary (to avoid exceeding GitHub limits)
	head -n 500 semantic-router-logs.txt >> $GITHUB_STEP_SUMMARY

	# Check if there are more lines
	TOTAL_LINES=$(wc -l < semantic-router-logs.txt)
	if [ "$TOTAL_LINES" -gt 500 ]; then
	{
	printf '\n... (showing first 500 lines of %s total lines)\n\n' "$TOTAL_LINES"
	printf '📦 Full logs are available in the workflow artifacts: semantic-router-logs.txt\n'
	} >> "$GITHUB_STEP_SUMMARY"
	fi

	{
	printf '```\n\n</details>\n'
	} >> "$GITHUB_STEP_SUMMARY"
	fi

	# Add additional context
	{
	printf '\n---\n\n### 📚 Additional Resources\n\n'
	printf -- '- Profile: `%s`\n' "${{ matrix.profile }}"
	printf -- '- Trigger: %s\n' "${{ github.event_name }}"
	printf -- '- Branch: `%s`\n' "${{ github.ref_name }}"
	printf -- '- Commit: `%s`\n' "${{ github.sha }}"
	printf -- '- Workflow Run: [%s](%s/%s/actions/runs/%s)\n' "${{ github.run_id }}" "${{ github.server_url }}" "${{ github.repository }}" "${{ github.run_id }}"
	printf -- '- [E2E Test Framework Documentation](https://github.com/%s/tree/main/e2e)\n' "${{ github.repository }}"
	printf -- '- [%s Profile](https://github.com/%s/tree/main/e2e/profiles/%s)\n' "${{ matrix.profile }}" "${{ github.repository }}" "${{ matrix.profile }}"
	printf '\n### 📦 Artifacts\n\n'
	printf -- '- test-report.json - Detailed test results in JSON format\n'
	printf -- '- test-report.md - Human-readable test report\n'
	printf -- '- semantic-router-logs.txt - Complete semantic-router pod logs\n'
	printf -- '- All artifacts are retained for 30 days as `test-reports-%s`\n' "${{ matrix.profile }}"
	} >> "$GITHUB_STEP_SUMMARY"
	else
	echo "⚠️ Test report file not found!" >> $GITHUB_STEP_SUMMARY
	echo "" >> $GITHUB_STEP_SUMMARY
	echo "The E2E test framework did not generate a report file." >> $GITHUB_STEP_SUMMARY
	echo "This might indicate that the test failed before report generation." >> $GITHUB_STEP_SUMMARY
	fi

	- name: Clean up
	if: always()
	run: \|
	make e2e-cleanup \|\| true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

refactor(milvus): share lifecycle across stores #4441

Workflow file

refactor(milvus): share lifecycle across stores #4441

Uh oh!

Workflow file for this run