# Source: opendatahub-io/rhaii-on-xks, .github/workflows/e2e-mock-test.yml (main branch)
# End-to-end test that deploys a mock vLLM image onto a KinD cluster.
name: E2E Mock vLLM Test

# Fires for pull requests targeting main or a release-* branch, when a label
# is added or new commits are pushed. Each job filters on the event action.
on:
  pull_request_target:
    types:
      - labeled
      - synchronize
    branches:
      - main
      - 'release-*'

# No token permissions at workflow level; every job declares its own.
permissions: {}

# Values shared by the steps of the jobs below.
env:
  MOCK_IMAGE: localhost/vllm-mock:test
  MOCK_NAMESPACE: mock-vllm-test
  KIND_CLUSTER_NAME: rhaii-e2e

jobs:
  # Remove run-e2e-test label on new pushes to force re-review.
  # Runs only for synchronize events on PRs whose event payload lists the label.
  remove-label-on-push:
    name: Remove e2e label on push
    permissions:
      pull-requests: write
    if: >-
      github.event.action == 'synchronize' &&
      contains(github.event.pull_request.labels.*.name, 'run-e2e-test')
    runs-on: ubuntu-latest
    steps:
      - name: Remove run-e2e-test label
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            // The label list in the event payload is a snapshot: the label may
            // already be gone when this runs (e.g. two pushes in quick
            // succession). A 404 means the desired state already holds, so it
            // is treated as success; any other error still fails the job.
            try {
              await github.rest.issues.removeLabel({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.payload.pull_request.number,
                name: 'run-e2e-test'
              });
              console.log('Removed run-e2e-test label — maintainer must re-add after reviewing new code');
            } catch (error) {
              if (error.status !== 404) {
                throw error;
              }
              console.log('run-e2e-test label was already removed — nothing to do');
            }

  # Builds the mock image, brings up a KinD cluster, deploys the stack and runs
  # the conformance tests against the PR's code.
  e2e-test:
    name: E2E Mock vLLM Test
    permissions:
      contents: read
    # Only runs when a maintainer adds the run-e2e-test label specifically.
    # Does not trigger on other labels or on synchronize (new pushes).
    if: github.event.action == 'labeled' && github.event.label.name == 'run-e2e-test'
    runs-on: ubuntu-latest
    # Several steps (image build, cluster creation, conformance tests) have no
    # timeout of their own. Cap the whole job so a hung step cannot hold the
    # runner for the platform default of 360 minutes.
    timeout-minutes: 60
    steps:
      # Check out the PR head commit recorded in the event payload, without
      # persisting credentials (persist-credentials: false).
      - name: Checkout PR code
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false
          ref: ${{ github.event.pull_request.head.sha }}

      # Build the mock image from the checked-out test/mock-vllm/ directory and
      # tag it with the workflow-level MOCK_IMAGE value.
      - name: Build mock vLLM image
        run: docker build --tag "$MOCK_IMAGE" test/mock-vllm/

      # Download a pinned helmfile release, verify its SHA-256 digest, unpack the
      # binary into /usr/local/bin, then install a pinned helm-diff plugin.
      - name: Install helmfile and helm-diff
        run: |
          version=0.169.2
          expected_sha256="34a5ca9c5fda733f0322f7b12a2959b7de4ab125bcf6531337751e263b027d58"
          archive=/tmp/helmfile.tar.gz
          url="https://github.com/helmfile/helmfile/releases/download/v${version}/helmfile_${version}_linux_amd64.tar.gz"

          curl -fsSL -o "$archive" "$url"
          echo "${expected_sha256}  ${archive}" | sha256sum -c -
          sudo tar -xzf "$archive" -C /usr/local/bin helmfile
          rm "$archive"

          helm plugin install https://github.com/databus23/helm-diff --version v3.9.14

      # Write a KinD cluster definition with one control-plane and one worker node.
      - name: Create KinD config
        run: |
          cat <<'KIND_CONFIG' > /tmp/kind-config.yaml
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          nodes:
            - role: control-plane
            - role: worker
          KIND_CONFIG

      # Create the cluster from that file under the shared cluster name.
      - name: Create KinD cluster
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
        with:
          config: /tmp/kind-config.yaml
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # Copy the locally built mock image into the cluster.
      - name: Load mock image into KinD
        run: kind load docker-image "$MOCK_IMAGE" --name "$KIND_CLUSTER_NAME"

      # On every KinD node: write the pull secret to /var/lib/kubelet/config.json
      # and restart the kubelet. Afterwards wait for all nodes to report Ready.
      - name: Configure pull credentials on KinD nodes
        env:
          PULL_SECRET: ${{ secrets.RAHII_ON_XKS_PULL_SECRET }}
        run: |
          kubelet_dir=/var/lib/kubelet
          for kind_node in $(kind get nodes --name "$KIND_CLUSTER_NAME"); do
            docker exec "$kind_node" mkdir -p "$kubelet_dir"
            printf '%s\n' "$PULL_SECRET" \
              | docker exec -i "$kind_node" tee "$kubelet_dir/config.json" > /dev/null
            docker exec "$kind_node" systemctl restart kubelet.service
          done
          kubectl wait nodes --all --for=condition=Ready --timeout=60s

      # Parse the pull secret as JSON and store it in
      # ~/.config/containers/auth.json on the runner.
      - name: Configure pull secret for helmfile
        env:
          PULL_SECRET: ${{ secrets.RAHII_ON_XKS_PULL_SECRET }}
        run: |
          mkdir -p ~/.config/containers
          python3 - <<'PY'
          import json
          import os
          import pathlib

          auth_file = pathlib.Path('~/.config/containers/auth.json').expanduser()
          credentials = json.loads(os.environ['PULL_SECRET'])
          auth_file.write_text(json.dumps(credentials))
          print('Pull secret configured')
          PY

      # Deploy the stack through the repository's Makefile.
      - name: Deploy infrastructure (make deploy-all)
        timeout-minutes: 15
        run: make deploy-all

      # Run the gateway setup script, then check the gateway ourselves.
      - name: Setup inference gateway
        timeout-minutes: 7
        run: |
          # The script's exit status is deliberately ignored: KinD has no
          # LoadBalancer, so the gateway stays AddressNotAssigned and the
          # script's Programmed wait is replaced by the checks below.
          ./scripts/setup-gateway.sh || true

          # The gateway must be Accepted ...
          kubectl wait gateway/inference-gateway \
            --namespace opendatahub \
            --for=condition=Accepted \
            --timeout=300s
          echo "[OK] Gateway accepted"

          # ... and its pod must be Ready.
          kubectl wait pod \
            --namespace opendatahub \
            --selector gateway.networking.k8s.io/gateway-name=inference-gateway \
            --for=condition=Ready \
            --timeout=300s
          echo "[OK] Gateway pod ready"

      # Print the overall status, then require the cert-manager webhook, istiod
      # and the KServe controller deployments to become Available.
      - name: Verify deployment
        run: |
          # Wait up to 120s for the given deployment selection to be Available.
          wait_available() {
            kubectl wait --for=condition=Available "$@" --timeout=120s
          }

          make status

          echo ""
          echo "=== Verifying components ==="

          wait_available deployment/cert-manager-webhook -n cert-manager
          echo "[OK] cert-manager"

          wait_available deployment -l app=istiod -n istio-system
          echo "[OK] Istiod"

          wait_available deployment/kserve-controller-manager -n opendatahub
          echo "[OK] KServe controller"

          echo ""
          echo "=== All components verified ==="

      # Create the mock namespace and a registry pull secret in it (both applied
      # idempotently), then attach the secret to the namespace's default
      # ServiceAccount.
      - name: Configure mock namespace pull secret
        run: |
          kubectl create namespace "$MOCK_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
          kubectl create secret docker-registry redhat-pull-secret \
            --from-file=.dockerconfigjson="$HOME/.config/containers/auth.json" \
            -n "$MOCK_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

          # The default ServiceAccount is created asynchronously after the
          # namespace. Poll for up to ~30s so the patch below does not race it.
          # If it never appears, the patch fails with the usual error.
          for _ in $(seq 1 30); do
            if kubectl get serviceaccount default -n "$MOCK_NAMESPACE" > /dev/null 2>&1; then
              break
            fi
            sleep 1
          done

          kubectl patch serviceaccount default -n "$MOCK_NAMESPACE" \
            -p '{"imagePullSecrets": [{"name": "redhat-pull-secret"}]}'

      # Deploy the mock model via the Makefile, passing the image name and an
      # IfNotPresent pull policy through the step environment.
      - name: Deploy mock model
        timeout-minutes: 5
        env:
          IMAGE_PULL_POLICY: IfNotPresent
          MOCK_IMAGE: ${{ env.MOCK_IMAGE }}
        run: make deploy-mock-model

      # Run the Makefile's test target against the mock namespace.
      - name: Run conformance tests
        run: make test NAMESPACE="$MOCK_NAMESPACE"

      # Runs only after a failed step. Every command is best-effort (|| true) so
      # one failing query does not hide the rest of the diagnostics.
      - name: Collect debug info on failure
        if: failure()
        run: |
          echo "=== Debug Info ==="
          make status || true

          echo ""
          echo "=== All pods ==="
          kubectl get pods -A || true

          echo ""
          echo "=== Mock namespace ==="
          kubectl get all -n "$MOCK_NAMESPACE" || true

          echo ""
          echo "=== LLMInferenceService ==="
          kubectl get llmisvc -A -o yaml || true

          echo ""
          echo "=== Events (mock namespace) ==="
          kubectl get events -n "$MOCK_NAMESPACE" --sort-by='.lastTimestamp' || true

          echo ""
          echo "=== Pod logs (mock namespace) ==="
          # Last 50 log lines of every container of every pod in the namespace.
          for pod_ref in $(kubectl get pods -n "$MOCK_NAMESPACE" -o name 2>/dev/null); do
            echo "--- $pod_ref ---"
            kubectl logs "$pod_ref" -n "$MOCK_NAMESPACE" --all-containers --tail=50 || true
          done

      # Always attempt cleanup; a failing cleanup does not fail the job.
      - name: Clean up mock model
        if: always()
        run: make clean-mock-model || true