Skip to content

refactor(milvus): share lifecycle across stores #4441

refactor(milvus): share lifecycle across stores

refactor(milvus): share lifecycle across stores #4441

# Workflow display name as shown in the Actions UI.
name: Integration Test [Kubernetes]

# Triggers: pull requests targeting main, pushes to main, and manual dispatch.
# Changes confined to website/** do not start a run.
on:
  pull_request:
    # ready_for_review fires when a draft pull request is marked ready.
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    branches:
      - main
    paths-ignore:
      - 'website/**'
  push:
    branches:
      - main
    paths-ignore:
      - 'website/**'
  workflow_dispatch:  # Allow manual triggering

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Detect which files changed to determine which profiles to test.
  # Delegates to the reusable ci-changes.yml workflow; its outputs are read
  # below as needs.changes.outputs.*.
  changes:
    uses: ./.github/workflows/ci-changes.yml

  # Determine which profiles need to be tested based on file changes.
  # Outputs:
  #   profiles   - JSON array of profile names, consumed as the test matrix
  #   should_run - 'true' when at least one profile must be tested
  determine-profiles:
    needs: changes
    runs-on: ubuntu-latest
    outputs:
      profiles: ${{ steps.set-matrix.outputs.profiles }}
      should_run: ${{ steps.set-matrix.outputs.should_run }}
    steps:
      - id: set-matrix
        run: |
          # Run the default PR baseline profiles if common e2e code, core code changes, or manual/scheduled trigger
          if [[ "${{ needs.changes.outputs.e2e_common }}" == "true" ]] || \
             [[ "${{ needs.changes.outputs.core }}" == "true" ]] || \
             [[ "${{ needs.changes.outputs.docker }}" == "true" ]] || \
             [[ "${{ needs.changes.outputs.make }}" == "true" ]] || \
             [[ "${{ needs.changes.outputs.ci }}" == "true" ]] || \
             [[ "${{ needs.changes.outputs.agent_exec }}" == "true" ]] || \
             [[ "${{ github.event_name }}" == "schedule" ]] || \
             [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
            echo 'profiles=["kubernetes", "dashboard"]' >> "$GITHUB_OUTPUT"
            echo 'should_run=true' >> "$GITHUB_OUTPUT"
            # NOTE(review): the message mentions "push" and "schedule", but `push`
            # is not part of the condition above and no `schedule` trigger is
            # visible in this workflow's `on:` block - confirm which is intended.
            echo "Running default baseline profiles due to common/core changes or push/schedule/manual trigger"
            exit 0
          fi

          # Only run affected profiles for PRs
          profiles=()
          [[ "${{ needs.changes.outputs.e2e_istio }}" == "true" ]] && profiles+=("istio")
          [[ "${{ needs.changes.outputs.e2e_kubernetes }}" == "true" ]] && profiles+=("kubernetes")
          [[ "${{ needs.changes.outputs.e2e_aibrix }}" == "true" ]] && profiles+=("aibrix")
          [[ "${{ needs.changes.outputs.e2e_dashboard }}" == "true" ]] && profiles+=("dashboard")
          [[ "${{ needs.changes.outputs.e2e_llm_d }}" == "true" ]] && profiles+=("llm-d")
          [[ "${{ needs.changes.outputs.e2e_routing_strategies }}" == "true" ]] && profiles+=("routing-strategies")
          [[ "${{ needs.changes.outputs.e2e_production_stack }}" == "true" ]] && profiles+=("production-stack")
          [[ "${{ needs.changes.outputs.e2e_dynamic_config }}" == "true" ]] && profiles+=("dynamic-config")
          [[ "${{ needs.changes.outputs.e2e_ml_model_selection }}" == "true" ]] && profiles+=("ml-model-selection")
          [[ "${{ needs.changes.outputs.e2e_multi_endpoint }}" == "true" ]] && profiles+=("multi-endpoint")
          [[ "${{ needs.changes.outputs.e2e_authz_rbac }}" == "true" ]] && profiles+=("authz-rbac")
          [[ "${{ needs.changes.outputs.e2e_streaming }}" == "true" ]] && profiles+=("streaming")

          # Convert to JSON array
          if [ ${#profiles[@]} -eq 0 ]; then
            echo 'profiles=[]' >> "$GITHUB_OUTPUT"
            echo 'should_run=false' >> "$GITHUB_OUTPUT"
            echo "No profile changes detected, skipping all e2e tests"
          else
            # Emit each name as "name", then strip the trailing comma below.
            printf -v json '"%s",' "${profiles[@]}"
            echo "profiles=[${json%,}]" >> "$GITHUB_OUTPUT"
            echo 'should_run=true' >> "$GITHUB_OUTPUT"
            echo "Running profiles: ${profiles[*]}"
          fi

  # Run the E2E suite once per selected profile. Skipped for draft pull
  # requests and when determine-profiles reports nothing to test.
  integration-test:
    needs: [changes, determine-profiles]
    if: ${{ needs.determine-profiles.outputs.should_run == 'true' && !github.event.pull_request.draft }}
    runs-on: ubuntu-latest
    timeout-minutes: 90
    env:
      E2E_SEMANTIC_ROUTER_HELM_TIMEOUT: 60m
    strategy:
      fail-fast: false  # Continue testing other profiles even if one fails
      matrix:
        # Dynamic profile matrix based on detected changes
        profile: ${{ fromJson(needs.determine-profiles.outputs.profiles) }}
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      - name: Free disk space and relocate Docker to /mnt
        run: |
          echo "=== Before cleanup ==="
          df -h / /mnt
          # Remove large pre-installed toolchains that E2E tests don't need
          sudo rm -rf /usr/local/lib/android /usr/share/dotnet /opt/ghc \
            /usr/local/share/boost /usr/local/graalvm /usr/local/.ghcup \
            /usr/share/swift /usr/local/lib/node_modules 2>/dev/null || true
          sudo docker image prune -af 2>/dev/null || true
          # Move Docker data root to /mnt (75 GB+ free vs ~14 GB on /)
          sudo systemctl stop docker
          sudo mv /var/lib/docker /mnt/docker
          sudo ln -s /mnt/docker /var/lib/docker
          sudo systemctl start docker
          # Redirect temp dir so `kind load docker-image` tarballs land on /mnt
          sudo mkdir -p /mnt/tmp
          sudo chmod 1777 /mnt/tmp
          echo "=== After cleanup ==="
          df -h / /mnt

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24'

      - name: Set up Rust
        uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: "1.90"

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            make \
            build-essential \
            pkg-config

      - name: Install Kind
        run: |
          # Map the machine architecture to the suffix used by Kind release binaries.
          ARCH="$(uname -m)"
          case "$ARCH" in
            x86_64) KIND_ARCH="amd64" ;;
            aarch64) KIND_ARCH="arm64" ;;
            *) echo "unsupported arch: $ARCH" && exit 1 ;;
          esac
          curl --retry 5 --retry-delay 5 --retry-all-errors -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.22.0/kind-linux-${KIND_ARCH}"
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind

      - name: Pre-pull Kind node image
        run: |
          KIND_NODE_IMAGE="kindest/node:v1.29.2"
          echo "Pre-pulling ${KIND_NODE_IMAGE} with retries..."
          # Up to 5 attempts, waiting attempt * 15 seconds between failures.
          for attempt in 1 2 3 4 5; do
            if docker pull "${KIND_NODE_IMAGE}"; then
              echo "Successfully pulled ${KIND_NODE_IMAGE}"
              break
            fi
            if [ "$attempt" -eq 5 ]; then
              echo "ERROR: Failed to pull ${KIND_NODE_IMAGE} after 5 attempts"
              exit 1
            fi
            echo "Pull attempt ${attempt} failed, retrying in $((attempt * 15))s..."
            sleep $((attempt * 15))
          done

      - name: Download E2E test dependencies
        run: |
          cd e2e && go mod download

      - name: Build E2E test binary
        run: |
          make build-e2e

      - name: Run Integration E2E tests (${{ matrix.profile }})
        id: e2e-test
        env:
          TMPDIR: /mnt/tmp
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HUGGINGFACE_HUB_TOKEN: ${{ secrets.HF_TOKEN }}
          # Handed to the script via the environment rather than being
          # interpolated into the script text.
          MATRIX_PROFILE: ${{ matrix.profile }}
        run: |
          # Stream router logs in the background for the duration of the tests.
          bash ./e2e/testing/stream_semantic_router_logs.sh &
          ROUTER_LOG_STREAM_PID=$!
          cleanup_router_log_stream() {
            if kill -0 "${ROUTER_LOG_STREAM_PID}" 2>/dev/null; then
              echo "[router-live] stopping background router log stream (pid ${ROUTER_LOG_STREAM_PID})"
              kill "${ROUTER_LOG_STREAM_PID}" 2>/dev/null || true
              wait "${ROUTER_LOG_STREAM_PID}" 2>/dev/null || true
            fi
          }
          trap cleanup_router_log_stream EXIT

          make_args=(
            "E2E_PROFILE=${MATRIX_PROFILE}"
            E2E_VERBOSE=true
            E2E_KEEP_CLUSTER=false
          )
          if [ "${MATRIX_PROFILE}" = "kubernetes" ]; then
            # Temporarily skip the stress / pressure coverage until the suite is stable again.
            KUBERNETES_CI_TESTS="chat-completions-request,apiserver-runtime-config-endpoints,domain-classify,semantic-cache,pii-detection,jailbreak-detection,decision-priority-selection,plugin-chain-execution,rule-condition-logic,decision-fallback-behavior,plugin-config-variations"
            make_args+=("E2E_TESTS=${KUBERNETES_CI_TESTS}")
          fi

          # Disable errexit around make so the exit code can be recorded as a
          # step output before the step itself fails.
          set +e
          make e2e-test "${make_args[@]}"
          TEST_EXIT_CODE=$?
          set -e
          echo "test_exit_code=${TEST_EXIT_CODE}" >> "$GITHUB_OUTPUT"
          exit "${TEST_EXIT_CODE}"

      - name: Collect logs via kubectl (fallback)
        if: always()
        run: |
          # Only collect when no semantic-router-logs.txt exists yet.
          if [ ! -f "semantic-router-logs.txt" ]; then
            echo "⚠️ semantic-router-logs.txt not found, collecting logs via kubectl as fallback..."
            echo "========================================" > semantic-router-logs.txt
            echo "Semantic Router Logs (collected via kubectl fallback)" >> semantic-router-logs.txt
            echo "========================================" >> semantic-router-logs.txt
            echo "" >> semantic-router-logs.txt
            for pod in $(kubectl get pods -n vllm-semantic-router-system -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
              echo "=== Pod: $pod ===" >> semantic-router-logs.txt
              kubectl describe pod "$pod" -n vllm-semantic-router-system >> semantic-router-logs.txt 2>&1 || true
              echo "--- Logs ---" >> semantic-router-logs.txt
              kubectl logs "$pod" -n vllm-semantic-router-system --all-containers=true >> semantic-router-logs.txt 2>&1 || true
              echo "" >> semantic-router-logs.txt
            done
            echo "✅ Fallback log collection complete"
          else
            echo "✅ semantic-router-logs.txt already exists (collected by Go framework)"
          fi

      - name: Upload test reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-reports-${{ matrix.profile }}
          path: |
            test-report.json
            test-report.md
            semantic-router-logs.txt
            response-api-artifacts/**
          retention-days: 30

      - name: Create test summary from report
        if: always()
        env:
          # Handed to the script via the environment rather than being
          # interpolated into the script text. The remaining values come from
          # the runner's default GITHUB_* environment variables.
          MATRIX_PROFILE: ${{ matrix.profile }}
        run: |
          if [ -f "test-report.md" ]; then
            echo "=== Reading test report from test-report.md ==="
            cat test-report.md >> "$GITHUB_STEP_SUMMARY"

            # Add semantic-router logs section if available
            if [ -f "semantic-router-logs.txt" ]; then
              printf '\n---\n\n### 📝 Semantic Router Logs\n\n<details>\n<summary>Click to view semantic-router logs</summary>\n\n```\n' >> "$GITHUB_STEP_SUMMARY"
              # Add first 500 lines of logs to summary (to avoid exceeding GitHub limits)
              head -n 500 semantic-router-logs.txt >> "$GITHUB_STEP_SUMMARY"
              # Check if there are more lines
              TOTAL_LINES=$(wc -l < semantic-router-logs.txt)
              if [ "$TOTAL_LINES" -gt 500 ]; then
                {
                  printf '\n... (showing first 500 lines of %s total lines)\n\n' "$TOTAL_LINES"
                  printf '📦 Full logs are available in the workflow artifacts: semantic-router-logs.txt\n'
                } >> "$GITHUB_STEP_SUMMARY"
              fi
              printf '```\n\n</details>\n' >> "$GITHUB_STEP_SUMMARY"
            fi

            # Add additional context
            {
              printf '\n---\n\n### 📚 Additional Resources\n\n'
              printf -- '- **Profile:** `%s`\n' "${MATRIX_PROFILE}"
              printf -- '- **Trigger:** %s\n' "${GITHUB_EVENT_NAME}"
              printf -- '- **Branch:** `%s`\n' "${GITHUB_REF_NAME}"
              printf -- '- **Commit:** `%s`\n' "${GITHUB_SHA}"
              printf -- '- **Workflow Run:** [%s](%s/%s/actions/runs/%s)\n' "${GITHUB_RUN_ID}" "${GITHUB_SERVER_URL}" "${GITHUB_REPOSITORY}" "${GITHUB_RUN_ID}"
              printf -- '- [E2E Test Framework Documentation](https://github.com/%s/tree/main/e2e)\n' "${GITHUB_REPOSITORY}"
              printf -- '- [%s Profile](https://github.com/%s/tree/main/e2e/profiles/%s)\n' "${MATRIX_PROFILE}" "${GITHUB_REPOSITORY}" "${MATRIX_PROFILE}"
              printf '\n### 📦 Artifacts\n\n'
              printf -- '- **test-report.json** - Detailed test results in JSON format\n'
              printf -- '- **test-report.md** - Human-readable test report\n'
              printf -- '- **semantic-router-logs.txt** - Complete semantic-router pod logs\n'
              printf -- '- All artifacts are retained for 30 days as `test-reports-%s`\n' "${MATRIX_PROFILE}"
            } >> "$GITHUB_STEP_SUMMARY"
          else
            {
              echo "⚠️ Test report file not found!"
              echo ""
              echo "The E2E test framework did not generate a report file."
              echo "This might indicate that the test failed before report generation."
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Clean up
        if: always()
        run: |
          # Best effort: a cleanup failure must not fail the job.
          make e2e-cleanup || true