Skip to content

Commit 731dc35

Browse files
khrm and claude committed
test: add e2e test for OpenCensus to OpenTelemetry metrics migration
Adds TestOTelMetrics, a consolidated e2e test for the OC→OTel metrics migration in Pipelines-as-Code (PR #2567).

The test scrapes two pods:

Controller (app.kubernetes.io/name=controller):
- Asserts http_client_* metrics from knative k8s client OTel instrumentation
- Asserts go_* runtime metrics
- Checks PAC application metrics (pipelines_as_code_*) are absent/present
- Asserts old OC metric names are absent

Watcher (app.kubernetes.io/name=watcher):
- Asserts kn_workqueue_* metrics (watcher uses knative reconciler)
- Asserts go_* runtime metrics

Verified locally with PAC controller and watcher deployed to kind via ko.

Relates to #2567

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent ffc5092 commit 731dc35

1 file changed

Lines changed: 335 additions & 0 deletions

File tree

test/metrics_otel_test.go

Lines changed: 335 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
1+
//go:build e2e
2+
3+
// Copyright 2026 The Tekton Authors
4+
//
5+
// Licensed under the Apache License, Version 2.0 (the "License");
6+
// you may not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
17+
package test
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"strings"
23+
"testing"
24+
"time"
25+
26+
"github.com/openshift-pipelines/pipelines-as-code/pkg/params/triggertype"
27+
tgitea "github.com/openshift-pipelines/pipelines-as-code/test/pkg/gitea"
28+
29+
dto "github.com/prometheus/client_model/go"
30+
"github.com/prometheus/common/expfmt"
31+
"github.com/prometheus/common/model"
32+
corev1 "k8s.io/api/core/v1"
33+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
"k8s.io/client-go/kubernetes"
35+
"k8s.io/client-go/tools/clientcmd"
36+
)
37+
38+
// Coordinates used by the helpers in this file to locate and scrape the
// Pipelines-as-Code pods.
const (
39+
// pacNamespace is the namespace in which pods are listed and proxied.
pacNamespace = "pipelines-as-code"
40+
// pacMetricsPort is the pod port appended to the pod name when building
// the API-server proxy request for /metrics.
pacMetricsPort = "9090"
41+
// pacControllerSelector is the label selector used to find the controller pod.
pacControllerSelector = "app.kubernetes.io/name=controller,app.kubernetes.io/part-of=pipelines-as-code"
42+
// pacWatcherSelector is the label selector used to find the watcher pod.
pacWatcherSelector = "app.kubernetes.io/name=watcher,app.kubernetes.io/part-of=pipelines-as-code"
43+
)
44+
45+
// pacKubeClient builds a kubernetes client from the default kubeconfig.
46+
func pacKubeClient(t *testing.T) kubernetes.Interface {
47+
t.Helper()
48+
rules := clientcmd.NewDefaultClientConfigLoadingRules()
49+
cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(rules, &clientcmd.ConfigOverrides{}).ClientConfig()
50+
if err != nil {
51+
t.Fatalf("Failed to build kubeconfig: %v", err)
52+
}
53+
return kubernetes.NewForConfigOrDie(cfg)
54+
}
55+
56+
// scrapePACPodMetrics scrapes /metrics from the first Running/Ready pod
57+
// matching labelSelector via the Kubernetes API proxy. Returns an error
58+
// so callers can retry on transient failures without aborting the test.
59+
func scrapePACPodMetrics(ctx context.Context, kubeClient kubernetes.Interface, labelSelector string) (map[string]*dto.MetricFamily, error) {
60+
pods, err := kubeClient.CoreV1().Pods(pacNamespace).List(ctx, metav1.ListOptions{
61+
LabelSelector: labelSelector,
62+
})
63+
if err != nil {
64+
return nil, err
65+
}
66+
67+
var podName string
68+
for _, pod := range pods.Items {
69+
if pod.Status.Phase != corev1.PodRunning {
70+
continue
71+
}
72+
podReady := len(pod.Status.ContainerStatuses) > 0
73+
for _, cs := range pod.Status.ContainerStatuses {
74+
if !cs.Ready {
75+
podReady = false
76+
break
77+
}
78+
}
79+
if podReady {
80+
podName = pod.Name
81+
break
82+
}
83+
}
84+
if podName == "" {
85+
return nil, fmt.Errorf("no Running/Ready PAC pod found for selector %q in namespace %s", labelSelector, pacNamespace)
86+
}
87+
88+
result := kubeClient.CoreV1().RESTClient().Get().
89+
Resource("pods").
90+
Name(podName + ":" + pacMetricsPort).
91+
Namespace(pacNamespace).
92+
SubResource("proxy").
93+
Suffix("metrics").
94+
Do(ctx)
95+
96+
body, err := result.Raw()
97+
if err != nil {
98+
return nil, err
99+
}
100+
101+
parser := expfmt.NewTextParser(model.LegacyValidation)
102+
families, err := parser.TextToMetricFamilies(strings.NewReader(string(body)))
103+
if err != nil {
104+
return nil, err
105+
}
106+
return families, nil
107+
}
108+
109+
// waitForControllerMetrics polls the PAC controller pod until the named
110+
// metric appears. Transient errors are logged and retried until timeout.
111+
func waitForControllerMetrics(ctx context.Context, t *testing.T, kubeClient kubernetes.Interface, metricName string, timeout time.Duration) map[string]*dto.MetricFamily {
112+
t.Helper()
113+
return waitForPACPodMetric(ctx, t, kubeClient, pacControllerSelector, metricName, timeout)
114+
}
115+
116+
// waitForWatcherMetrics polls the PAC watcher pod until the named metric
117+
// appears. Transient errors are logged and retried until timeout.
118+
func waitForWatcherMetrics(ctx context.Context, t *testing.T, kubeClient kubernetes.Interface, metricName string, timeout time.Duration) map[string]*dto.MetricFamily {
119+
t.Helper()
120+
return waitForPACPodMetric(ctx, t, kubeClient, pacWatcherSelector, metricName, timeout)
121+
}
122+
123+
// waitForPACPodMetric is the shared polling implementation used by
124+
// waitForControllerMetrics and waitForWatcherMetrics.
125+
func waitForPACPodMetric(ctx context.Context, t *testing.T, kubeClient kubernetes.Interface, labelSelector, metricName string, timeout time.Duration) map[string]*dto.MetricFamily {
126+
t.Helper()
127+
ctx, cancel := context.WithTimeout(ctx, timeout)
128+
defer cancel()
129+
for {
130+
families, err := scrapePACPodMetrics(ctx, kubeClient, labelSelector)
131+
if err == nil {
132+
if _, ok := families[metricName]; ok {
133+
return families
134+
}
135+
} else {
136+
t.Logf("Retrying metrics scrape (%s): %v", labelSelector, err)
137+
}
138+
select {
139+
case <-ctx.Done():
140+
t.Fatalf("Timed out waiting for metric %q (selector=%s, waited %v): %v", metricName, labelSelector, timeout, ctx.Err())
141+
return nil
142+
case <-time.After(5 * time.Second):
143+
}
144+
}
145+
}
146+
147+
// counterValue returns the sum of all counter values for the given metric name.
148+
func counterValue(families map[string]*dto.MetricFamily, name string) float64 {
149+
fam, ok := families[name]
150+
if !ok {
151+
return 0
152+
}
153+
var total float64
154+
for _, m := range fam.GetMetric() {
155+
if c := m.GetCounter(); c != nil {
156+
total += c.GetValue()
157+
}
158+
}
159+
return total
160+
}
161+
162+
// TestOthersOTelMetricsController verifies that the PAC controller pod exposes the
163+
// expected OTel metric families after the OC→OTel migration (PR #2567):
164+
// - http_client_* and kn_k8s_client_* (knative k8s client OTel instrumentation)
165+
// - go_* runtime metrics
166+
// - PAC application metrics logged (appear only after first PipelineRun)
167+
// - Old OpenCensus metric names absent
168+
func TestOthersOTelMetricsController(t *testing.T) {
169+
ctx := context.Background()
170+
kubeClient := pacKubeClient(t)
171+
172+
t.Log("Waiting for PAC controller metrics (http_client_request_duration_seconds)")
173+
families := waitForControllerMetrics(ctx, t, kubeClient, "http_client_request_duration_seconds", 2*time.Minute)
174+
t.Logf("Scraped %d metric families from PAC controller", len(families))
175+
176+
tests := []struct {
177+
name string
178+
prefix string
179+
errMsg string
180+
}{
181+
{
182+
name: "http_client_prefix",
183+
prefix: "http_client_",
184+
errMsg: "Expected at least one http_client_* metric from knative k8s client instrumentation, found none",
185+
},
186+
{
187+
name: "kn_k8s_client_prefix",
188+
prefix: "kn_k8s_client_",
189+
errMsg: "Expected at least one kn_k8s_client_* metric from knative k8s client instrumentation, found none",
190+
},
191+
{
192+
name: "go_runtime_prefix",
193+
prefix: "go_",
194+
errMsg: "Expected standard go_* runtime metrics, found none",
195+
},
196+
}
197+
for _, tt := range tests {
198+
t.Run(tt.name, func(t *testing.T) {
199+
for name := range families {
200+
if strings.HasPrefix(name, tt.prefix) {
201+
return
202+
}
203+
}
204+
t.Error(tt.errMsg)
205+
})
206+
}
207+
208+
// Old OC metric names must be absent.
209+
// TODO: Remove in a future release once no OC-based release is supported.
210+
for name := range families {
211+
for _, prefix := range []string{"pipelines_as_code/", "tekton_pipelines_as_code_"} {
212+
if strings.HasPrefix(name, prefix) {
213+
t.Errorf("Old OC metric %q still present; expected removal after OTel migration", name)
214+
}
215+
}
216+
}
217+
218+
}
219+
220+
// TestOthersOTelMetricsWatcher verifies that the PAC watcher pod exposes the
221+
// expected OTel metric families after the OC→OTel migration (PR #2567):
222+
// - kn_workqueue_* (knative reconciler workqueue)
223+
// - http_client_* and kn_k8s_client_* (knative k8s client OTel instrumentation)
224+
// - go_* runtime metrics
225+
func TestOthersOTelMetricsWatcher(t *testing.T) {
226+
ctx := context.Background()
227+
kubeClient := pacKubeClient(t)
228+
229+
t.Log("Waiting for PAC watcher metrics (go_goroutines)")
230+
families := waitForWatcherMetrics(ctx, t, kubeClient, "go_goroutines", 2*time.Minute)
231+
t.Logf("Scraped %d metric families from PAC watcher", len(families))
232+
233+
tests := []struct {
234+
name string
235+
prefix string
236+
errMsg string
237+
}{
238+
{
239+
name: "kn_workqueue_prefix",
240+
prefix: "kn_workqueue_",
241+
errMsg: "Expected at least one kn_workqueue_* metric on the PAC watcher, found none",
242+
},
243+
{
244+
name: "http_client_prefix",
245+
prefix: "http_client_",
246+
errMsg: "Expected at least one http_client_* metric on the PAC watcher, found none",
247+
},
248+
{
249+
name: "kn_k8s_client_prefix",
250+
prefix: "kn_k8s_client_",
251+
errMsg: "Expected at least one kn_k8s_client_* metric on the PAC watcher, found none",
252+
},
253+
{
254+
name: "go_runtime_prefix",
255+
prefix: "go_",
256+
errMsg: "Expected standard go_* runtime metrics on PAC watcher, found none",
257+
},
258+
}
259+
for _, tt := range tests {
260+
t.Run(tt.name, func(t *testing.T) {
261+
for name := range families {
262+
if strings.HasPrefix(name, tt.prefix) {
263+
return
264+
}
265+
}
266+
t.Error(tt.errMsg)
267+
})
268+
}
269+
}
270+
271+
// TestOthersOTelMetricsAfterPACRun triggers a real PAC PipelineRun via Gitea
272+
// and asserts that pipelines_as_code_pipelinerun_count increments by
273+
// exactly 1. The test skips automatically if Gitea is not configured
274+
// (TEST_GITEA_API_URL and related env vars are unset).
275+
func TestOthersOTelMetricsAfterPACRun(t *testing.T) {
276+
ctx := context.Background()
277+
kubeClient := pacKubeClient(t)
278+
279+
// Baseline before the PAC run to compute an exact delta.
280+
baseline, err := scrapePACPodMetrics(ctx, kubeClient, pacControllerSelector)
281+
if err != nil {
282+
t.Skipf("PAC controller metrics not reachable, skipping: %v", err)
283+
}
284+
baseCount := counterValue(baseline, "pipelines_as_code_pipelinerun_count")
285+
286+
// TestPR sets up Gitea, creates a repo, pushes .tekton/pr.yaml, creates a
287+
// PR, waits for PAC to process it. It calls t.Skip if Gitea is not
288+
// configured via TEST_GITEA_API_URL / TEST_GITEA_PASSWORD env vars.
289+
topts := &tgitea.TestOpts{
290+
Regexp: successRegexp,
291+
TargetEvent: triggertype.PullRequest.String(),
292+
YAMLFiles: map[string]string{".tekton/pr.yaml": "testdata/always-good-pipelinerun.yaml"},
293+
CheckForStatus: "success",
294+
}
295+
_, f := tgitea.TestPR(t, topts)
296+
defer f()
297+
298+
// Assert exact delta == 1 (one PipelineRun processed by PAC).
299+
after := waitForControllerMetrics(ctx, t, kubeClient, "pipelines_as_code_pipelinerun_count", 2*time.Minute)
300+
delta := counterValue(after, "pipelines_as_code_pipelinerun_count") - baseCount
301+
if delta != 1 {
302+
t.Errorf("pipelinerun_count delta = %v, want exactly 1", delta)
303+
}
304+
t.Logf("pipelines_as_code_pipelinerun_count delta: %v", delta)
305+
306+
// Assert all PAC application metrics are present after a real run.
307+
appTests := []struct {
308+
name string
309+
metricName string
310+
errMsg string
311+
}{
312+
{
313+
name: "pipelinerun_count",
314+
metricName: "pipelines_as_code_pipelinerun_count",
315+
errMsg: "pipelines_as_code_pipelinerun_count not found after PAC run",
316+
},
317+
{
318+
name: "pipelinerun_duration_seconds_sum",
319+
metricName: "pipelines_as_code_pipelinerun_duration_seconds_sum",
320+
errMsg: "pipelines_as_code_pipelinerun_duration_seconds_sum not found after PAC run",
321+
},
322+
{
323+
name: "git_provider_api_request_count",
324+
metricName: "pipelines_as_code_git_provider_api_request_count",
325+
errMsg: "pipelines_as_code_git_provider_api_request_count not found after PAC run",
326+
},
327+
}
328+
for _, tt := range appTests {
329+
t.Run(tt.name, func(t *testing.T) {
330+
if _, ok := after[tt.metricName]; !ok {
331+
t.Error(tt.errMsg)
332+
}
333+
})
334+
}
335+
}

0 commit comments

Comments
 (0)