feat: Add KV cache event synchronization system #24
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow-level configuration: display name, triggers and shared tool versions.
name: Complete Testing Pipeline

on:
  push:
    branches:
      - main
      - 'release-*'
      - 'feature/all-dev*'
  pull_request:
    branches:
      - main
  schedule:
    # Nightly at 2 AM UTC
    - cron: '0 2 * * *'
  workflow_dispatch:
    inputs:
      # Manual runs execute the chaos job only when this is set to 'true'.
      run_chaos_tests:
        description: 'Run chaos tests'
        type: choice
        required: false
        default: 'false'
        options:
          - 'true'
          - 'false'

# Tool versions referenced by the jobs through the `env` context.
env:
  GO_VERSION: '1.23'
  KIND_VERSION: 'v0.26.0'
  CHAOS_MESH_VERSION: 'latest'

jobs:
# Job (belongs under the top-level `jobs:` key): runs the `test-zmq-coverage`
# and `test-kv-sync` make targets and publishes coverage.out as an artifact.
unit-tests:
  name: Unit Tests
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: ${{ env.GO_VERSION }}

    # libzmq headers and pkg-config are installed before the make targets run.
    - name: Install ZMQ
      run: |
        sudo apt-get update
        sudo apt-get install -y libzmq3-dev pkg-config

    - name: Run unit tests
      run: |
        make test-zmq-coverage
        make test-kv-sync

    # Stored under the artifact name `unit-test-coverage`.
    - name: Upload coverage
      uses: actions/upload-artifact@v4
      with:
        name: unit-test-coverage
        path: coverage.out
# Job (belongs under the top-level `jobs:` key): runs the kv_event_sync
# integration tests once, then converts the captured output into a JUnit report.
integration-tests:
  name: Integration Tests
  runs-on: ubuntu-latest
  env:
    # Run only the kv_event_sync tests (exclude webhook tests that need kubebuilder).
    INTEGRATION_PKG: github.com/vllm-project/aibrix/test/integration
    INTEGRATION_RUN: '^Test(PodLifecycle|Configuration|EventFlow|Concurrent|MetricsUpdate|ErrorHandling)'
  steps:
    - uses: actions/checkout@v4

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: ${{ env.GO_VERSION }}

    - name: Install ZMQ
      run: |
        sudo apt-get update
        sudo apt-get install -y libzmq3-dev pkg-config

    - name: Run integration tests
      run: |
        # The default `run` shell does not enable pipefail; turn it on so a
        # failing `go test` is not hidden behind the exit status of `tee`.
        set -o pipefail
        go test -v -tags="zmq" "${INTEGRATION_PKG}" -run "${INTEGRATION_RUN}" 2>&1 | tee integration-test.log

    - name: Generate test report
      if: always()
      run: |
        # Convert the log captured by the previous step instead of executing
        # the whole suite a second time; the report then describes the run
        # that actually determined the job result.
        if [ ! -f integration-test.log ]; then
          echo "No integration test log found; skipping report generation"
          exit 0
        fi
        go install github.com/jstemmer/go-junit-report/v2@latest
        go-junit-report < integration-test.log > integration-test-report.xml

    - name: Upload test report
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: integration-test-report
        path: integration-test-report.xml
# Job (belongs under the top-level `jobs:` key): creates a Kind cluster, loads
# the nightly images, deploys AIBrix plus a mock vLLM workload, and runs the
# KV sync E2E tests through local port-forwards.
e2e-tests:
  name: E2E Tests
  runs-on: ubuntu-latest
  timeout-minutes: 40
  steps:
    - uses: actions/checkout@v4

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: ${{ env.GO_VERSION }}

    - name: Install Kind
      run: |
        # -f makes curl fail on an HTTP error instead of saving the error page
        # as the kind binary.
        curl -fLo ./kind https://kind.sigs.k8s.io/dl/${{ env.KIND_VERSION }}/kind-linux-amd64
        chmod +x ./kind
        sudo mv ./kind /usr/local/bin/kind

    - name: Create Kind cluster
      run: |
        kind create cluster --config development/vllm/kind-config.yaml
        kubectl cluster-info

    - name: Install ZMQ
      run: |
        sudo apt-get update
        sudo apt-get install -y libzmq3-dev pkg-config

    - name: Build and load images
      run: |
        make docker-build-all
        echo "Loading controller-manager image..."
        kind load docker-image aibrix/controller-manager:nightly
        echo "Loading gateway-plugins image..."
        kind load docker-image aibrix/gateway-plugins:nightly
        echo "Loading metadata-service image..."
        kind load docker-image aibrix/metadata-service:nightly
        echo "Loading runtime image..."
        kind load docker-image aibrix/runtime:nightly
        echo "Loading kvcache-watcher image..."
        kind load docker-image aibrix/kvcache-watcher:nightly

    - name: Deploy AIBrix
      run: |
        kubectl apply -k config/dependency --server-side
        kubectl apply -k config/test
        # Wait for Redis to be ready first as it's a dependency for E2E tests
        echo "Waiting for Redis to be ready..."
        kubectl wait --for=condition=ready pod -l app=redis,role=master -n aibrix-system --timeout=300s
        # Wait for controller-manager
        kubectl wait --for=condition=ready pod -l control-plane=controller-manager -n aibrix-system --timeout=300s
        # Show pod status for debugging
        echo "Pod status in aibrix-system namespace:"
        kubectl get pods -n aibrix-system

    # Each step starts in the workspace root, so `working-directory` replaces
    # the former `cd development/app` / `cd ../..` pair.
    - name: Build mock vLLM image
      working-directory: development/app
      run: |
        docker build -t aibrix/vllm-mock:nightly -f Dockerfile .
        # Load image only to worker node where vLLM pods will run
        kind load docker-image aibrix/vllm-mock:nightly --nodes kind-worker

    - name: Deploy mock workload
      working-directory: development/app
      run: |
        kubectl apply -k config/mock
        # Wait for llama2-7b service to be ready
        kubectl wait --for=condition=ready pod -l app=mock-llama2-7b --timeout=300s

    - name: Setup port-forwarding
      run: |
        # Setup port-forwarding for services required by E2E tests
        echo "Setting up port-forwarding..."
        kubectl port-forward svc/llama2-7b 8000:8000 >/tmp/llama-pf.log 2>&1 &
        kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 >/tmp/envoy-pf.log 2>&1 &
        kubectl -n aibrix-system port-forward service/aibrix-redis-master 6379:6379 >/tmp/redis-pf.log 2>&1 &

        # Poll until the local listener accepts connections (up to 30s) rather
        # than relying on a fixed sleep, which is either too short or wasted time.
        wait_for_port() {
          local port="$1"
          for _ in $(seq 1 30); do
            if nc -z 127.0.0.1 "${port}" 2>/dev/null; then
              return 0
            fi
            sleep 1
          done
          return 1
        }

        # As before, only Redis is a hard requirement of this step; the other
        # forwards are reported but do not fail it.
        wait_for_port 8000 || { echo "WARNING: llama2-7b port-forward not reachable on 127.0.0.1:8000"; cat /tmp/llama-pf.log || true; }
        wait_for_port 8888 || { echo "WARNING: envoy port-forward not reachable on 127.0.0.1:8888"; cat /tmp/envoy-pf.log || true; }

        # Test Redis connectivity
        echo "Testing Redis connectivity..."
        wait_for_port 6379 || {
          echo "ERROR: Cannot connect to Redis on 127.0.0.1:6379"
          echo "Redis port-forward log:"
          cat /tmp/redis-pf.log || true
          exit 1
        }
        echo "Redis is accessible on 127.0.0.1:6379"

    - name: Run E2E tests
      run: |
        export KUBECONFIG="${HOME}/.kube/config"
        # Run KV sync E2E tests from the package to include all helper files
        # Use zmq build tag since KV cache functionality requires ZMQ
        go test -v -tags="zmq" ./test/e2e -run "TestKVSync" -timeout 30m

    - name: Collect logs on failure
      if: failure()
      run: |
        # Best-effort diagnostics: one failing kubectl call must not suppress
        # the remaining output (the shell runs with -e).
        kubectl get pods --all-namespaces || true
        kubectl logs -n aibrix-system -l app.kubernetes.io/name=controller-manager --tail=100 || true
        kubectl logs -n kv-sync-test --all-containers=true --tail=100 || true

    - name: Cleanup
      if: always()
      run: |
        kind delete cluster
# Job (belongs under the top-level `jobs:` key): runs the Go benchmarks on
# scheduled and manual runs, compares them with a baseline when one is
# available, and publishes the results.
performance-tests:
  name: Performance Benchmarks
  runs-on: ubuntu-latest
  if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
  steps:
    - uses: actions/checkout@v4

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: ${{ env.GO_VERSION }}

    - name: Install ZMQ
      run: |
        sudo apt-get update
        sudo apt-get install -y libzmq3-dev pkg-config

    # The baseline artifact contains `benchmark-results.txt`; it is downloaded
    # into its own directory so the fresh results written below cannot
    # overwrite it.
    # NOTE(review): download-artifact@v4 only sees artifacts of the current
    # workflow run unless `run-id` and `github-token` are supplied, so a
    # baseline uploaded by an earlier run is not found as configured. The step
    # stays best-effort until a run id lookup is added.
    - name: Download baseline metrics
      uses: actions/download-artifact@v4
      with:
        name: baseline-metrics
        path: test/benchmark/baseline
      continue-on-error: true

    - name: Run performance benchmarks
      run: |
        # Without pipefail the step would report the exit status of `tee` and
        # hide benchmark failures.
        set -o pipefail
        cd test/benchmark
        go test -bench=. -benchmem -benchtime=10s -count=3 -cpu=1,2,4 -tags="zmq" | tee benchmark-results.txt

    - name: Generate benchmark report
      run: |
        go install golang.org/x/perf/cmd/benchstat@latest
        cd test/benchmark
        if [ -f baseline/benchmark-results.txt ]; then
          benchstat baseline/benchmark-results.txt benchmark-results.txt > benchmark-comparison.txt || true
        else
          echo "No baseline metrics available; comparison skipped" > benchmark-comparison.txt
        fi

    - name: Check for performance regression
      run: |
        # `go run` needs a source file or package argument and does not read a
        # program from stdin, so the placeholder check is written to a scratch
        # directory first. The unused "os" import was dropped because Go
        # rejects unused imports at compile time.
        check_dir="$(mktemp -d)"
        cat > "${check_dir}/main.go" <<'EOF'
        package main

        import "fmt"

        func main() {
            // Simple regression check - enhance as needed
            fmt.Println("Performance regression check passed")
        }
        EOF
        cd "${check_dir}"
        go run -tags="zmq" main.go

    - name: Upload benchmark results
      uses: actions/upload-artifact@v4
      with:
        name: benchmark-results-${{ github.sha }}
        path: |
          test/benchmark/benchmark-results.txt
          test/benchmark/benchmark-comparison.txt

    - name: Update baseline metrics
      if: github.ref == 'refs/heads/main'
      uses: actions/upload-artifact@v4
      with:
        name: baseline-metrics
        path: test/benchmark/benchmark-results.txt
        # A re-run of the same workflow run would otherwise fail because the
        # artifact name already exists.
        overwrite: true
# Job (belongs under the top-level `jobs:` key): runs the chaos test package
# against a dedicated Kind cluster with Chaos Mesh installed. It executes on
# scheduled runs, or on manual runs when `run_chaos_tests` is 'true'.
chaos-tests:
  name: Chaos Tests
  runs-on: ubuntu-latest
  if: |
    github.event_name == 'schedule' ||
    (github.event_name == 'workflow_dispatch' && github.event.inputs.run_chaos_tests == 'true')
  steps:
    - uses: actions/checkout@v4

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: ${{ env.GO_VERSION }}

    - name: Install Kind
      run: |
        # -f makes curl fail on an HTTP error instead of saving the error page
        # as the kind binary.
        curl -fLo ./kind https://kind.sigs.k8s.io/dl/${{ env.KIND_VERSION }}/kind-linux-amd64
        chmod +x ./kind
        sudo mv ./kind /usr/local/bin/kind

    - name: Create Kind cluster
      run: |
        kind create cluster --config development/vllm/kind-config.yaml --name chaos-test
        kubectl cluster-info

    - name: Install Chaos Mesh
      run: |
        curl -sSL https://mirrors.chaos-mesh.org/${{ env.CHAOS_MESH_VERSION }}/install.sh | bash
        kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=controller-manager -n chaos-mesh --timeout=300s

    - name: Install ZMQ
      run: |
        sudo apt-get update
        sudo apt-get install -y libzmq3-dev pkg-config

    - name: Build and deploy test environment
      run: |
        make docker-build-all
        # Build the mock vLLM image explicitly, the same way the E2E job does,
        # so the `kind load` below has an image to load.
        # NOTE(review): assumes `make docker-build-all` does not already
        # produce aibrix/vllm-mock:nightly - confirm against the Makefile.
        docker build -t aibrix/vllm-mock:nightly -f development/app/Dockerfile development/app
        kind load docker-image aibrix/controller-manager:nightly --name chaos-test
        kind load docker-image aibrix/gateway-plugins:nightly --name chaos-test
        kind load docker-image aibrix/vllm-mock:nightly --name chaos-test
        kubectl apply -k config/dependency --server-side
        kubectl apply -k config/test

    - name: Run chaos tests
      run: |
        export KUBECONFIG="${HOME}/.kube/config"
        go test -v ./test/chaos/ -timeout 45m

    - name: Collect chaos test results
      if: always()
      run: |
        # Chaos Mesh registers its experiment kinds under the chaos-mesh.org
        # API group; `chaosengine` is not one of them, so the kinds are
        # discovered from the cluster instead. Every command is best-effort so
        # diagnostics are still printed when an earlier step failed.
        for kind in $(kubectl api-resources --api-group=chaos-mesh.org --namespaced=true -o name 2>/dev/null || true); do
          kubectl get "${kind}" --all-namespaces || true
        done
        kubectl logs -n chaos-mesh -l app.kubernetes.io/component=controller-manager --tail=100 || true

    - name: Cleanup
      if: always()
      run: |
        kind delete cluster --name chaos-test
# Job (belongs under the top-level `jobs:` key): builds a Markdown summary from
# the artifacts of the unit, integration and E2E jobs, uploads it, and posts it
# as a comment on pull requests. It runs even when the jobs it needs failed.
test-report:
  name: Generate Test Report
  runs-on: ubuntu-latest
  needs: [unit-tests, integration-tests, e2e-tests]
  if: always()
  # Token scopes used by this job: checkout reads the repository and the
  # comment step writes to the pull request.
  permissions:
    contents: read
    pull-requests: write
  steps:
    - uses: actions/checkout@v4

    # Without a `name`, every artifact of this run is downloaded into its own
    # sub-directory of `test-artifacts`.
    - name: Download test artifacts
      uses: actions/download-artifact@v4
      with:
        path: test-artifacts

    - name: Generate consolidated report
      run: |
        echo "# Test Results Summary" > test-report.md
        echo "" >> test-report.md
        echo "## Unit Tests" >> test-report.md
        if [ -f test-artifacts/unit-test-coverage/coverage.out ]; then
          echo "✅ Unit tests completed" >> test-report.md
          go tool cover -func=test-artifacts/unit-test-coverage/coverage.out | tail -1 >> test-report.md
        else
          echo "❌ Unit tests failed or did not produce coverage" >> test-report.md
        fi
        echo "" >> test-report.md
        echo "## Integration Tests" >> test-report.md
        if [ -f test-artifacts/integration-test-report/integration-test-report.xml ]; then
          echo "✅ Integration tests completed" >> test-report.md
        else
          echo "❌ Integration tests failed" >> test-report.md
        fi
        echo "" >> test-report.md
        echo "## E2E Tests" >> test-report.md
        echo "Status based on job result" >> test-report.md
        echo "" >> test-report.md
        cat test-report.md

    - name: Upload test report
      uses: actions/upload-artifact@v4
      with:
        name: test-report
        path: test-report.md

    - name: Comment PR with test results
      if: github.event_name == 'pull_request'
      uses: actions/github-script@v7
      with:
        script: |
          const fs = require('fs');
          const report = fs.readFileSync('test-report.md', 'utf8');
          // Await the request so an API error fails this step instead of
          // surfacing as an unhandled promise rejection.
          await github.rest.issues.createComment({
            issue_number: context.issue.number,
            owner: context.repo.owner,
            repo: context.repo.repo,
            body: report,
          });
# Job (belongs under the top-level `jobs:` key): on scheduled runs, derives an
# overall status from the unit, integration and E2E job results and prints a
# placeholder notification when any of them did not succeed.
notify-results:
  name: Notify Results
  runs-on: ubuntu-latest
  needs: [unit-tests, integration-tests, e2e-tests, performance-tests, chaos-tests]
  if: always() && github.event_name == 'schedule'
  steps:
    - name: Determine status
      id: status
      run: |
        # Only these three job results decide the status; any value other
        # than "success" marks the run as failed.
        overall=success
        for job_result in \
          "${{ needs.unit-tests.result }}" \
          "${{ needs.integration-tests.result }}" \
          "${{ needs.e2e-tests.result }}"; do
          if [ "${job_result}" != "success" ]; then
            overall=failure
          fi
        done
        echo "status=${overall}" >> "$GITHUB_OUTPUT"

    - name: Notify on failure
      if: steps.status.outputs.status == 'failure'
      run: |
        echo "Tests failed! Notification would be sent here."
        # Add Slack/email notification logic here