Skip to content

Commit 7e50ccd

Browse files
carlydf and claude committed
Remove worker_version label_replace workaround for Temporal Cloud backlog metrics
Temporal Cloud now emits temporal_worker_deployment_name and temporal_worker_build_id as separate labels, so the label_replace recording rule and adapter name alias are no longer needed. The adapter queries temporal_cloud_v1_approximate_backlog_count directly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent fdada2a commit 7e50ccd

9 files changed

Lines changed: 28 additions & 72 deletions

api/v1alpha1/workerresourcetemplate_webhook_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ func TestWorkerResourceTemplate_ValidateCreate(t *testing.T) {
253253
"type": "External",
254254
"external": map[string]interface{}{
255255
"metric": map[string]interface{}{
256-
"name": "temporal_backlog_count_by_version",
256+
"name": "temporal_cloud_v1_approximate_backlog_count",
257257
"selector": map[string]interface{}{
258258
"matchLabels": map[string]interface{}{
259259
"task_type": "Activity",
@@ -282,7 +282,7 @@ func TestWorkerResourceTemplate_ValidateCreate(t *testing.T) {
282282
"type": "External",
283283
"external": map[string]interface{}{
284284
"metric": map[string]interface{}{
285-
"name": "temporal_backlog_count_by_version",
285+
"name": "temporal_cloud_v1_approximate_backlog_count",
286286
"selector": map[string]interface{}{
287287
"matchLabels": map[string]interface{}{
288288
"temporal_worker_deployment_name": "default_my-worker",
@@ -309,7 +309,7 @@ func TestWorkerResourceTemplate_ValidateCreate(t *testing.T) {
309309
"type": "External",
310310
"external": map[string]interface{}{
311311
"metric": map[string]interface{}{
312-
"name": "temporal_backlog_count_by_version",
312+
"name": "temporal_cloud_v1_approximate_backlog_count",
313313
"selector": map[string]interface{}{
314314
"matchLabels": map[string]interface{}{
315315
"temporal_worker_build_id": "abc123",
@@ -336,7 +336,7 @@ func TestWorkerResourceTemplate_ValidateCreate(t *testing.T) {
336336
"type": "External",
337337
"external": map[string]interface{}{
338338
"metric": map[string]interface{}{
339-
"name": "temporal_backlog_count_by_version",
339+
"name": "temporal_cloud_v1_approximate_backlog_count",
340340
"selector": map[string]interface{}{
341341
"matchLabels": map[string]interface{}{
342342
"temporal_namespace": "my-ns",
@@ -363,7 +363,7 @@ func TestWorkerResourceTemplate_ValidateCreate(t *testing.T) {
363363
"type": "External",
364364
"external": map[string]interface{}{
365365
"metric": map[string]interface{}{
366-
"name": "temporal_backlog_count_by_version",
366+
"name": "temporal_cloud_v1_approximate_backlog_count",
367367
"selector": map[string]interface{}{
368368
"matchLabels": map[string]interface{}{},
369369
},
@@ -390,7 +390,7 @@ func TestWorkerResourceTemplate_ValidateCreate(t *testing.T) {
390390
"type": "External",
391391
"external": map[string]interface{}{
392392
"metric": map[string]interface{}{
393-
"name": "temporal_backlog_count_by_version",
393+
"name": "temporal_cloud_v1_approximate_backlog_count",
394394
"selector": map[string]interface{}{},
395395
},
396396
"target": map[string]interface{}{

docs/worker-resource-templates.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ spec:
130130
- type: External
131131
external:
132132
metric:
133-
name: temporal_backlog_count_by_version
133+
name: temporal_cloud_v1_approximate_backlog_count
134134
selector:
135135
matchLabels:
136136
task_type: "Activity"

examples/wrt-hpa-backlog.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# helm upgrade prometheus-adapter prometheus-community/prometheus-adapter \
1919
# -n monitoring -f internal/demo/k8s/prometheus-adapter-values.yaml
2020
# 3. Verify the backlog metric is flowing:
21-
# # Port-forward Prometheus and query: temporal_backlog_count_by_version
21+
# # Port-forward Prometheus and query: temporal_cloud_v1_approximate_backlog_count
2222
#
2323
# Apply:
2424
# kubectl apply -f examples/wrt-hpa-backlog.yaml
@@ -61,12 +61,15 @@ spec:
6161
value: "750m"
6262

6363
# Metric: backlog count — scale up when tasks are queued but not yet picked up.
64+
# Temporal Cloud emits temporal_cloud_v1_approximate_backlog_count with
65+
# temporal_worker_deployment_name and temporal_worker_build_id as separate labels;
66+
# prometheus-adapter exposes it to the HPA under that same name (no rename/alias rule).
6467
# temporal_worker_deployment_name, temporal_worker_build_id, and temporal_namespace are
6568
# appended automatically by the controller — do not set them here.
6669
- type: External
6770
external:
6871
metric:
69-
name: temporal_backlog_count_by_version
72+
name: temporal_cloud_v1_approximate_backlog_count
7073
selector:
7174
matchLabels:
7275
task_type: "Activity"

internal/demo/README.md

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -262,14 +262,6 @@ Stop the load generator (`Ctrl-C`) and watch the HPA scale back down as in-fligh
262262
263263
`approximate_backlog_count` measures tasks queued in Temporal but not yet started on a worker. Adding it as a second HPA metric means the HPA scales up on *arriving* work even before slots are full — important for bursty traffic.
264264
265-
> **Note:** Temporal Cloud emits `temporal_approximate_backlog_count` with a combined
266-
> `worker_version="<worker-deployment-name>_<build-id>"` label that easily exceeds Kubernetes max label
267-
> length of 63 characters. The recording rule in `prometheus-stack-values.yaml` uses `label_replace`
268-
> to extract `temporal_worker_deployment_name` and `temporal_worker_build_id` as separate k8s-compatible
269-
> labels, producing `temporal_backlog_count_by_version`. The HPA then selects on those labels — the same
270-
> pair used by Phase 1. Temporal Cloud is in the process of rolling out the new separate labels, so this
271-
> workaround is required until then.
272-
273265
**Step 1 — Create the Temporal Cloud credentials secret.**
274266
275267
Create a Temporal Cloud metrics API key (separate from the namespace API key) at Cloud UI → Settings → Observability → Generate API Key. Save it to `certs/metrics-api-key.txt`, then create the secret in the `monitoring` namespace:
@@ -294,11 +286,11 @@ helm upgrade prometheus-adapter prometheus-community/prometheus-adapter \
294286
295287
```bash
296288
kubectl -n monitoring port-forward svc/prometheus-kube-prometheus-prometheus 9092:9090 &
297-
curl -s 'http://localhost:9092/api/v1/query?query=temporal_backlog_count_by_version' \
289+
curl -s 'http://localhost:9092/api/v1/query?query=temporal_cloud_v1_approximate_backlog_count' \
298290
| jq '.data.result'
299291
```
300292
301-
You should see a result with `twd_name` and `build_id` labels. If the result is empty, wait 15–30s for the recording rule to evaluate.
293+
You should see results with `temporal_worker_deployment_name` and `temporal_worker_build_id` labels. If the result is empty, wait 10–15s for the first scrape to complete.
302294
303295
**Step 4 — Apply the combined WRT.**
304296
```bash

internal/demo/k8s/grafana-dashboard.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@
127127
},
128128
"targets": [
129129
{
130-
"expr": "temporal_backlog_count_by_version{task_type=\"Workflow\", temporal_worker_deployment_name=\"default_helloworld\"}",
130+
"expr": "temporal_cloud_v1_approximate_backlog_count{task_type=\"Workflow\", temporal_worker_deployment_name=\"default_helloworld\"}",
131131
"legendFormat": "{{temporal_worker_deployment_name}} / {{temporal_worker_build_id}}",
132132
"refId": "A"
133133
}
@@ -152,7 +152,7 @@
152152
},
153153
"targets": [
154154
{
155-
"expr": "temporal_backlog_count_by_version{task_type=\"Activity\", temporal_worker_deployment_name=\"default_helloworld\"}",
155+
"expr": "temporal_cloud_v1_approximate_backlog_count{task_type=\"Activity\", temporal_worker_deployment_name=\"default_helloworld\"}",
156156
"legendFormat": "{{temporal_worker_deployment_name}} / {{temporal_worker_build_id}}",
157157
"refId": "A"
158158
}

internal/demo/k8s/prometheus-adapter-values.yaml

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,11 @@ rules:
3333
namespaced: false # cluster-scoped: HPAs in any k8s namespace can consume this metric
3434

3535
# Phase 2: approximate backlog count per worker version (from Temporal Cloud).
36-
# Uses the temporal_backlog_count_by_version recording rule.
37-
# cluster-scoped so HPAs in any namespace can consume it; temporal_worker_deployment_name
38-
# + temporal_worker_build_id + temporal_namespace matchLabels in the HPA are sufficient to
39-
# select the right series.
40-
- seriesQuery: 'temporal_backlog_count_by_version{}'
36+
# Temporal Cloud emits temporal_worker_deployment_name and temporal_worker_build_id as
37+
# separate labels, so the raw metric can be used directly — no recording rule needed.
38+
# cluster-scoped so HPAs in any namespace can consume it.
39+
- seriesQuery: 'temporal_cloud_v1_approximate_backlog_count{}'
4140
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>})'
42-
name:
43-
as: "temporal_backlog_count_by_version"
4441
resources:
4542
namespaced: false # cluster-scoped: HPAs in any namespace can consume this metric
4643

internal/demo/k8s/prometheus-stack-values.yaml

Lines changed: 4 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ prometheus:
4848
metrics_path: /v1/metrics
4949
params:
5050
labels:
51-
- worker_version
51+
- temporal_worker_deployment_name
52+
- temporal_worker_build_id
53+
- temporal_namespace
54+
- task_type
5255

5356
# ─── 3. Recording Rules ─────────────────────────────────────────────────────
5457

@@ -80,43 +83,4 @@ additionalPrometheusRulesMap:
8083
1
8184
)
8285
83-
- name: temporal_cloud_backlog
84-
interval: 10s
85-
rules:
86-
# Backlog count per worker version, shaped to match the label format that
87-
# Temporal Cloud will emit natively in a future release. This recording rule
88-
# is a temporary shim: once Temporal Cloud emits temporal_worker_deployment_name and
89-
# temporal_worker_build_id as separate labels, this rule can be deleted with no
90-
# other changes. Note: this rule only works with Build IDs that don't have underscores.
91-
#
92-
# Current Temporal Cloud label:
93-
# worker_version="{k8s-namespace}_{workerdeployment-name}_{build-id}"
94-
#
95-
# k8s namespaces and WorkerDeployment names follow DNS naming rules (no underscores), so _
96-
# is an unambiguous separator and label_replace is reliable.
97-
#
98-
# Extracted labels (matching future Temporal Cloud output):
99-
# temporal_worker_deployment_name — "{k8s-namespace}_{workerdeployment-name}" (all but last segment)
100-
# temporal_worker_build_id — last segment; ≤63 chars, valid k8s label value
101-
# temporal_namespace — preserved from source metric
102-
#
103-
# Produced labels:
104-
# temporal_worker_deployment_name — inner label_replace: all but the last _ segment
105-
# temporal_worker_build_id — outer label_replace: last _ segment
106-
# temporal_namespace — preserved from source metric
107-
# task_type — preserved as a dimension; HPAs can filter by
108-
# task_type="Activity", task_type="Workflow", etc.
109-
# The prometheus-adapter serves this as a cluster-scoped external metric
110-
# (namespaced: false), so HPAs in any namespace can consume it.
111-
- record: temporal_backlog_count_by_version
112-
expr: |
113-
label_replace(
114-
label_replace(
115-
sum by (worker_version, task_type, temporal_namespace) (
116-
temporal_cloud_v1_approximate_backlog_count{worker_version=~".+_.+_.+"}
117-
),
118-
"temporal_worker_deployment_name", "$1", "worker_version", "^(.+)_[^_]+$"
119-
),
120-
"temporal_worker_build_id", "$1", "worker_version", ".*_([^_]+)$"
121-
)
12286

internal/k8s/workerresourcetemplates_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ func TestAutoInjectFields_MatchLabels(t *testing.T) {
180180
"type": "External",
181181
"external": map[string]interface{}{
182182
"metric": map[string]interface{}{
183-
"name": "temporal_backlog_count_by_version",
183+
"name": "temporal_cloud_v1_approximate_backlog_count",
184184
"selector": map[string]interface{}{
185185
"matchLabels": map[string]interface{}{}, // opt-in for metric selector
186186
},
@@ -223,7 +223,7 @@ func TestAutoInjectFields_MetricSelector(t *testing.T) {
223223
"type": "External",
224224
"external": map[string]interface{}{
225225
"metric": map[string]interface{}{
226-
"name": "temporal_backlog_count_by_version",
226+
"name": "temporal_cloud_v1_approximate_backlog_count",
227227
"selector": map[string]interface{}{
228228
"matchLabels": matchLabels,
229229
},
@@ -259,7 +259,7 @@ func TestAutoInjectFields_MetricSelector(t *testing.T) {
259259
"type": "External",
260260
"external": map[string]interface{}{
261261
"metric": map[string]interface{}{
262-
"name": "temporal_backlog_count_by_version",
262+
"name": "temporal_cloud_v1_approximate_backlog_count",
263263
"selector": map[string]interface{}{},
264264
},
265265
},

internal/tests/internal/wrt_integration_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ func wrtTestCases() []testCase {
192192
"type": "External",
193193
"external": {
194194
"metric": {
195-
"name": "temporal_backlog_count_by_version",
195+
"name": "temporal_cloud_v1_approximate_backlog_count",
196196
"selector": {
197197
"matchLabels": {
198198
"task_type": "Activity"

0 commit comments

Comments
 (0)