Skip to content

Commit 65579cb

Browse files
authored
feat: add metrics validation in e2e test (llm-d#529)
Signed-off-by: CYJiang <googs1025@gmail.com>
1 parent 9a55e4f commit 65579cb

File tree

4 files changed

+90
-2
lines changed

4 files changed

+90
-2
lines changed

test/e2e/e2e_suite_test.go

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ const (
5454
)
5555

5656
var (
57-
port string = env.GetEnvString("E2E_PORT", "30080", ginkgo.GinkgoLogr)
57+
port string = env.GetEnvString("E2E_PORT", "30080", ginkgo.GinkgoLogr)
58+
metricsPort string = env.GetEnvString("E2E_METRICS_PORT", "32090", ginkgo.GinkgoLogr)
5859

5960
testConfig *testutils.TestConfig
6061

@@ -80,7 +81,8 @@ var (
8081
infPoolObjects []string
8182
createdNameSpace bool
8283

83-
portForwardSession *gexec.Session
84+
portForwardSession *gexec.Session
85+
eppPortForwardSession *gexec.Session
8486
)
8587

8688
func TestEndToEnd(t *testing.T) {
@@ -115,6 +117,10 @@ var _ = ginkgo.AfterSuite(func() {
115117
portForwardSession.Terminate()
116118
}
117119

120+
if eppPortForwardSession != nil {
121+
eppPortForwardSession.Terminate()
122+
}
123+
118124
// cleanup created objects
119125
ginkgo.By("Deleting created Kubernetes objects")
120126
testutils.DeleteObjects(testConfig, infPoolObjects)
@@ -149,6 +155,7 @@ func setupK8sCluster() {
149155
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
150156
}()
151157
clusterConfig := strings.ReplaceAll(kindClusterConfig, "${PORT}", port)
158+
clusterConfig = strings.ReplaceAll(clusterConfig, "${METRICS_PORT}", metricsPort)
152159
_, err := io.WriteString(stdin, clusterConfig)
153160
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
154161
}()
@@ -284,6 +291,22 @@ func createInferencePool(numTargetPorts int, toDelete bool) []string {
284291
return testutils.CreateObjsFromYaml(testConfig, infPoolYaml)
285292
}
286293

294+
func startEPPMetricsPortForward() {
295+
pods, err := testConfig.KubeCli.CoreV1().Pods(nsName).List(testConfig.Context, metav1.ListOptions{
296+
LabelSelector: "app=e2e-epp",
297+
})
298+
gomega.Expect(err).NotTo(gomega.HaveOccurred())
299+
gomega.Expect(pods.Items).NotTo(gomega.BeEmpty())
300+
301+
eppPodName := pods.Items[0].Name
302+
command := exec.Command("kubectl", "port-forward", "pod/"+eppPodName, metricsPort+":9090",
303+
"--context="+k8sContext, "--namespace="+nsName)
304+
eppPortForwardSession, err = gexec.Start(command, ginkgo.GinkgoWriter, ginkgo.GinkgoWriter)
305+
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
306+
// Give it a moment to establish
307+
time.Sleep(3 * time.Second)
308+
}
309+
287310
const kindClusterConfig = `
288311
kind: Cluster
289312
apiVersion: kind.x-k8s.io/v1alpha4
@@ -295,4 +318,7 @@ nodes:
295318
- containerPort: 30081
296319
hostPort: 30081
297320
protocol: TCP
321+
- containerPort: 32090
322+
hostPort: ${METRICS_PORT}
323+
protocol: TCP
298324
`

test/e2e/e2e_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ package e2e
22

33
import (
44
"fmt"
5+
"io"
56
"net/http"
67
"strconv"
8+
"strings"
79
"time"
810

911
"github.com/onsi/ginkgo/v2"
@@ -76,6 +78,13 @@ var _ = ginkgo.Describe("Run end to end tests", ginkgo.Ordered, func() {
7678

7779
epp := createEndPointPicker(pdConfig)
7880

81+
metricsURL := fmt.Sprintf("http://localhost:%s/metrics", metricsPort)
82+
83+
if k8sContext != "" {
84+
// Use port-forward to access the EPP pod's metrics endpoint.
85+
startEPPMetricsPortForward()
86+
}
87+
7988
prefillPods, decodePods := getModelServerPods(podSelector, prefillSelector, decodeSelector)
8089
gomega.Expect(prefillPods).Should(gomega.HaveLen(prefillReplicas))
8190
gomega.Expect(decodePods).Should(gomega.HaveLen(decodeReplicas))
@@ -110,6 +119,16 @@ var _ = ginkgo.Describe("Run end to end tests", ginkgo.Ordered, func() {
110119
gomega.Expect(podHdr).Should(gomega.BeElementOf(decodePods))
111120
gomega.Expect(podHdr).Should(gomega.Equal(podHdrChat))
112121

122+
// Metrics Validation
123+
labelFilter := fmt.Sprintf(`decision_type="prefill-decode",model_name="%s"`, modelName)
124+
prefillDecodeCount := getCounterMetric(metricsURL, "llm_d_inference_scheduler_pd_decision_total", labelFilter)
125+
126+
labelFilter2 := fmt.Sprintf(`decision_type="decode-only",model_name="%s"`, modelName)
127+
decodeOnlyCount := getCounterMetric(metricsURL, "llm_d_inference_scheduler_pd_decision_total", labelFilter2)
128+
129+
gomega.Expect(prefillDecodeCount).Should(gomega.Equal(6))
130+
gomega.Expect(decodeOnlyCount).Should(gomega.Equal(0))
131+
113132
testutils.DeleteObjects(testConfig, epp)
114133
testutils.DeleteObjects(testConfig, modelServers)
115134
})
@@ -383,6 +402,33 @@ func runChatCompletion(prompt string) (string, string, string) {
383402
return namespaceHeader, podHeader, podPort
384403
}
385404

405+
// getCounterMetric fetches the current value of a Prometheus counter metric from the given metrics URL.
406+
func getCounterMetric(metricsURL, metricName, labelMatch string) int {
407+
resp, err := http.Get(metricsURL)
408+
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
409+
defer func() {
410+
err = resp.Body.Close()
411+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
412+
}()
413+
gomega.Expect(resp.StatusCode).Should(gomega.Equal(http.StatusOK))
414+
415+
body, err := io.ReadAll(resp.Body)
416+
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
417+
418+
metricsText := string(body)
419+
for _, line := range strings.Split(metricsText, "\n") {
420+
if strings.HasPrefix(line, metricName) && strings.Contains(line, labelMatch) {
421+
fields := strings.Fields(line)
422+
if len(fields) >= 2 {
423+
valFloat, err := strconv.ParseFloat(fields[len(fields)-1], 64)
424+
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
425+
return int(valFloat)
426+
}
427+
}
428+
}
429+
return 0
430+
}
431+
386432
// Simple EPP configuration for running without P/D
387433
const simpleConfig = `apiVersion: inference.networking.x-k8s.io/v1alpha1
388434
kind: EndpointPickerConfig

test/e2e/yaml/deployments.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ spec:
3535
- "9003"
3636
- --config-file
3737
- "/etc/epp/epp-config.yaml"
38+
- --metrics-endpoint-auth=false
3839
env:
3940
- name: PYTHONHASHSEED
4041
value: "42"

test/e2e/yaml/services.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,18 @@ spec:
3333
nodePort: 30081
3434
appProtocol: http2
3535
type: NodePort
36+
---
37+
apiVersion: v1
38+
kind: Service
39+
metadata:
40+
name: e2e-epp-metrics
41+
spec:
42+
selector:
43+
app: e2e-epp
44+
ports:
45+
- name: metrics
46+
protocol: TCP
47+
port: 9090
48+
targetPort: 9090
49+
nodePort: 32090
50+
type: NodePort

0 commit comments

Comments
 (0)