Skip to content

Commit 5f00a13

Browse files
carlydfclaudeShivs11
authored
feat: replace domain conditions with standard Ready/Progressing conditions (#235)
<!--- Note to EXTERNAL Contributors --> <!-- Thanks for opening a PR! If it is a significant code change, please **make sure there is an open issue** for this. We work best with you when we have accepted the idea first before you code. --> <!--- For ALL Contributors 👇 --> ## What was changed Deprecate the controller-specific TemporalConnectionHealthy and RolloutComplete conditions in favor of the two standard Kubernetes conditions (Ready, Progressing) that GitOps tools like ArgoCD and Flux can consume without custom parsing logic. - Ready=True when the target version is Current in Temporal - Progressing=True while a rollout is in-flight (WaitingForPollers, WaitingForPromotion, Ramping) - Progressing=False on any blocking error (connection failures, plan errors), regardless of whether a rollout is active Also adds docs/cd-rollouts.md with integration guidance for kubectl, Helm 3/4, ArgoCD (Lua health check), and Flux (Kustomization + HelmRelease). ## Why? Previous conditions were not easy for CD systems to consume. They were only merged to main, not included in any release, so we can still change them. ## Checklist <!--- add/delete as needed ---> 1. Closes #198 2. How was this tested: Functional and unit tests. 3. Any docs updates needed? <!--- update README if applicable or point out where to update docs.temporal.io --> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: Shivam Saraf <shivam.saraf@temporal.io> Co-authored-by: Shivam <57200924+Shivs11@users.noreply.github.com>
1 parent 6908663 commit 5f00a13

8 files changed

Lines changed: 473 additions & 79 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ See [docs/crd-management.md](docs/crd-management.md) for upgrade, rollback, and
104104

105105
**New to deploying workers with this controller?** → Start with our [Migration Guide](docs/migration-to-versioned.md) to learn how to safely transition from traditional deployments.
106106

107+
**Setting up CI/CD for steady-state rollouts?** → See the [CD Rollouts Guide](docs/cd-rollouts.md) for Helm, kubectl, ArgoCD, and Flux integration patterns.
108+
107109
**Ready to dive deeper?** → Check out the [Architecture Guide](docs/architecture.md) to understand how the controller works, or the [Temporal Worker Versioning docs](https://docs.temporal.io/production-deployment/worker-deployments/worker-versioning) to learn about the underlying Temporal feature.
108110

109111
**Need configuration help?** → See the [Configuration Reference](docs/configuration.md) for all available options.
@@ -145,6 +147,7 @@ The Temporal Worker Controller eliminates this operational overhead by automatin
145147
|----------|-------------|
146148
| [Migration Guide](docs/migration-to-versioned.md) | Step-by-step guide for migrating from traditional deployments |
147149
| [Reversion Guide](docs/migration-to-unversioned.md) | Step-by-step guide for migrating back to unversioned deployment |
150+
| [CD Rollouts](docs/cd-rollouts.md) | Helm, kubectl, ArgoCD, and Flux integration for steady-state rollouts |
148151
| [Architecture](docs/architecture.md) | Technical deep-dive into how the controller works |
149152
| [Configuration](docs/configuration.md) | Complete configuration reference |
150153
| [Concepts](docs/concepts.md) | Key concepts and terminology |

api/v1alpha1/worker_types.go

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,24 @@ type TemporalWorkerDeploymentSpec struct {
8888

8989
// Condition type constants for TemporalWorkerDeployment.
9090
const (
91-
// ConditionTemporalConnectionHealthy indicates whether the referenced TemporalConnection
92-
// resource exists and is properly configured.
91+
// ConditionReady is True when the Temporal connection is reachable and the
92+
// target version is the current version in Temporal. CD systems such as
93+
// ArgoCD and Flux use this condition to gate deployment success.
94+
ConditionReady = "Ready"
95+
96+
// ConditionProgressing is True while a rollout is actively in-flight —
97+
// i.e., the target version has not yet been promoted to current.
98+
ConditionProgressing = "Progressing"
99+
)
100+
101+
// Deprecated condition type constants. Maintained for backward compatibility with
102+
// monitoring and automation built against v1.3.x. Use Ready and Progressing
103+
// instead. These will be removed in the next major version of the CRD.
104+
const (
105+
// Deprecated: Use ConditionReady and ConditionProgressing instead.
93106
ConditionTemporalConnectionHealthy = "TemporalConnectionHealthy"
94107

95-
// ConditionRolloutComplete indicates whether the target version has been successfully
96-
// registered as the current version, completing the rollout.
108+
// Deprecated: Use ConditionReady instead.
97109
ConditionRolloutComplete = "RolloutComplete"
98110
)
99111

@@ -104,33 +116,46 @@ const (
104116
// They should be treated as stable within an API version and renamed only with
105117
// a corresponding version bump.
106118
const (
107-
// ReasonTemporalConnectionNotFound is set on ConditionTemporalConnectionHealthy
108-
// when the referenced TemporalConnection resource cannot be found.
119+
// ReasonRolloutComplete is set on ConditionReady=True and ConditionProgressing=False
120+
// when the target version has been successfully registered as the current version.
121+
ReasonRolloutComplete = "RolloutComplete"
122+
123+
// ReasonWaitingForPollers is set on ConditionProgressing=True when the target
124+
// version's Kubernetes Deployment has been created but the version is not yet
125+
// registered with Temporal (workers have not started polling yet).
126+
ReasonWaitingForPollers = "WaitingForPollers"
127+
128+
// ReasonWaitingForPromotion is set on ConditionProgressing=True when the target
129+
// version is registered with Temporal (Inactive) but has not yet been promoted
130+
// to current or ramping.
131+
ReasonWaitingForPromotion = "WaitingForPromotion"
132+
133+
// ReasonRamping is set on ConditionProgressing=True when the target version is
134+
// the ramping version and is receiving a configured percentage of new workflows.
135+
ReasonRamping = "Ramping"
136+
137+
// ReasonTemporalConnectionNotFound is set on ConditionProgressing=False when the
138+
// referenced TemporalConnection resource cannot be found.
109139
ReasonTemporalConnectionNotFound = "TemporalConnectionNotFound"
110140

111-
// ReasonAuthSecretInvalid is set on ConditionTemporalConnectionHealthy when the
112-
// credential secret referenced by the TemporalConnection is misconfigured. This
113-
// covers: (1) the secret reference has an empty name, (2) the named Kubernetes
114-
// Secret cannot be fetched or has an unexpected type, and (3) the mTLS certificate
115-
// in the secret is expired or about to expire.
141+
// ReasonAuthSecretInvalid is set on ConditionProgressing=False when the credential
142+
// secret referenced by the TemporalConnection is misconfigured. This covers:
143+
// (1) the secret reference has an empty name, (2) the named Kubernetes Secret
144+
// cannot be fetched or has an unexpected type, and (3) the mTLS certificate in
145+
// the secret is expired or about to expire.
116146
ReasonAuthSecretInvalid = "AuthSecretInvalid"
117147

118-
// ReasonTemporalClientCreationFailed is set on ConditionTemporalConnectionHealthy
119-
// when the Temporal SDK client cannot connect to the server (dial failure or failed
120-
// health check). The credentials were valid; the server itself is unreachable.
148+
// ReasonTemporalClientCreationFailed is set on ConditionProgressing=False when the
149+
// Temporal SDK client cannot connect to the server (dial failure or failed health
150+
// check). The credentials were valid; the server itself is unreachable.
121151
ReasonTemporalClientCreationFailed = "TemporalClientCreationFailed"
122152

123-
// ReasonTemporalStateFetchFailed is set on ConditionTemporalConnectionHealthy
124-
// when the controller cannot query the current worker deployment state from Temporal.
153+
// ReasonTemporalStateFetchFailed is set on ConditionProgressing=False when the
154+
// controller cannot query the current worker deployment state from Temporal.
125155
ReasonTemporalStateFetchFailed = "TemporalStateFetchFailed"
126156

127-
// ReasonTemporalConnectionHealthy is set on ConditionTemporalConnectionHealthy
128-
// when the connection is reachable and the auth secret is resolved.
157+
// Deprecated: Use ReasonRolloutComplete on ConditionReady instead.
129158
ReasonTemporalConnectionHealthy = "TemporalConnectionHealthy"
130-
131-
// ReasonRolloutComplete is set on ConditionRolloutComplete when the target
132-
// version has been successfully registered as the current version.
133-
ReasonRolloutComplete = "RolloutComplete"
134159
)
135160

136161
// VersionStatus indicates the status of a version.

docs/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ This documentation structure is designed to support various types of technical d
1111

1212
## Index
1313

14+
### [CD Rollouts](cd-rollouts.md)
15+
How to integrate the controller into Helm, kubectl, ArgoCD, and Flux pipelines for steady-state rollouts once you are already using Worker Versioning.
16+
1417
### [Architecture](architecture.md)
1518
High-level overview of the Temporal Worker Controller architecture.
1619

docs/cd-rollouts.md

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# CD Rollouts with the Temporal Worker Controller
2+
3+
This guide describes patterns for integrating the Temporal Worker Controller into a CD pipeline. It is intended for teams that are already using Worker Versioning in steady state.
4+
5+
> **Note:** The examples below illustrate common integration patterns but are not guaranteed to work verbatim with every version of each tool. API fields, configuration keys, and default behaviors change between releases. Always verify against the documentation for the specific tool you are using.
6+
7+
For migration help, see [migration-to-versioned.md](migration-to-versioned.md).
8+
9+
## Understanding the conditions
10+
11+
The `TemporalWorkerDeployment` resource exposes two standard conditions on `status.conditions` that CD tools and scripts can consume.
12+
13+
### `Ready`
14+
15+
`Ready=True` means the controller successfully reached Temporal **and** the target version is the current version in Temporal. This is the primary signal that a rollout has finished and the worker is fully operational.
16+
17+
When the rollout has finished, `Ready=True` is set with reason `RolloutComplete`.
18+
19+
`Ready` is `False` while the controller cannot reach Temporal or the target version is not yet the current version. The `reason` field tells you why:
20+
21+
| Reason | Meaning |
22+
|---|---|
23+
| `WaitingForPollers` | Target version's Deployment exists but workers haven't registered with Temporal yet |
24+
| `WaitingForPromotion` | Workers are registered (Inactive) but not yet promoted to Current |
25+
| `Ramping` | Progressive strategy is ramping traffic to the new version |
26+
| Error reasons (see Progressing below) | A blocking error is preventing progress |
27+
28+
### `Progressing`
29+
30+
`Progressing=True` means a rollout is actively in-flight and the controller is making forward progress. `Progressing=False` means either the rollout is done (`Ready=True`) or a blocking error is preventing progress.
31+
32+
When `Progressing=False`, the `reason` field identifies why — either the rollout completed successfully or a blocking error occurred:
33+
34+
| Reason | Meaning |
35+
|---|---|
36+
| `RolloutComplete` | Not an error — the rollout finished successfully |
37+
| `TemporalConnectionNotFound` | The referenced `TemporalConnection` resource doesn't exist |
38+
| `AuthSecretInvalid` | The credential secret is missing, malformed, or has an expired certificate |
39+
| `TemporalClientCreationFailed` | The controller can't reach the Temporal server (dial/health-check failure) |
40+
| `TemporalStateFetchFailed` | The controller reached the server but can't read the worker deployment state |
41+
| `PlanGenerationFailed` | Internal error generating the reconciliation plan |
42+
| `PlanExecutionFailed` | Internal error executing the plan (e.g., a Kubernetes API call failed) |
43+
44+
Once the underlying problem is fixed, the next successful reconcile will restore `Progressing` and `Ready` to the correct state.
45+
46+
## Triggering a rollout
47+
48+
A rollout starts when you change the pod template in your `TemporalWorkerDeployment` spec — a changed pod spec produces a new Build ID, which the controller treats as a new version to roll out.
49+
50+
With Helm (image tag update):
51+
52+
```yaml
53+
# values.yaml
54+
image:
55+
repository: my-registry/my-worker
56+
tag: "v2.3.0"
57+
```
58+
59+
```bash
60+
helm upgrade my-worker ./chart --values values.yaml
61+
```
62+
63+
With a plain manifest:
64+
65+
```yaml
66+
# twd.yaml
67+
spec:
68+
template:
69+
spec:
70+
containers:
71+
- name: worker
72+
image: my-registry/my-worker:v2.3.0
73+
```
74+
75+
```bash
76+
kubectl apply -f twd.yaml
77+
```
78+
79+
The controller picks up the change on the next reconcile loop (within seconds) and begins the rollout.
80+
81+
## kubectl
82+
83+
`kubectl wait` can block a pipeline script until `Ready=True`:
84+
85+
```bash
86+
kubectl apply -f twd.yaml
87+
kubectl wait temporalworkerdeployment/my-worker \
88+
--for=condition=Ready \
89+
--timeout=10m
90+
```
91+
92+
Set `--timeout` to exceed the longest expected rollout time — for progressive strategies this is the sum of all `pauseDuration` values plus the time for workers to start and register. `kubectl wait` exits non-zero on timeout, which you can use to fail the pipeline.
93+
94+
## Helm
95+
96+
### Helm 4
97+
98+
Helm 4 uses [kstatus](https://github.com/kubernetes-sigs/cli-utils/tree/master/pkg/kstatus) for its `--wait` implementation ([HIP-0022](https://helm.sh/community/hips/hip-0022/)). kstatus understands the standard Kubernetes conditions contract, so `helm upgrade --wait` should block until your `TemporalWorkerDeployment` reports `Ready=True`:
99+
100+
```bash
101+
helm upgrade my-worker ./chart --values values.yaml --wait --timeout 10m
102+
```
103+
104+
> **Verify:** Check your Helm 4 release notes — kstatus behavior and the `--wait` flag semantics have evolved across point releases.
105+
106+
### Helm 3
107+
108+
Helm 3's `--wait` only covers a hardcoded set of native resource types (Deployments, StatefulSets, DaemonSets, Jobs, Pods) and does not inspect conditions on custom resources. A separate `kubectl wait` step is one approach:
109+
110+
```bash
111+
helm upgrade my-worker ./chart --values values.yaml
112+
kubectl wait temporalworkerdeployment/my-worker \
113+
--for=condition=Ready \
114+
--timeout=10m \
115+
--namespace my-namespace
116+
```
117+
118+
## ArgoCD
119+
120+
ArgoCD does not have a generic fallback that automatically checks `status.conditions` on unknown CRD types. For any resource whose group (`temporal.io`) is not in ArgoCD's built-in health check registry, ArgoCD silently skips that resource when computing application health. A [custom Lua health check](https://argo-cd.readthedocs.io/en/stable/operator-manual/health/) is the standard mechanism for teaching ArgoCD how to assess a CRD's health.
121+
122+
The two standard conditions (`Ready`, `Progressing`) keep the Lua simple — it only needs to read the condition type and status, not any controller-specific status fields. The following script is a starting point; adapt it to your ArgoCD version and any site-specific requirements:
123+
124+
```yaml
125+
# In your argocd-cm ConfigMap
126+
data:
127+
resource.customizations.health.temporal.io_TemporalWorkerDeployment: |
128+
local ready = nil
129+
local progressing = nil
130+
if obj.status ~= nil and obj.status.conditions ~= nil then
131+
for _, c in ipairs(obj.status.conditions) do
132+
if c.type == "Ready" then ready = c end
133+
if c.type == "Progressing" then progressing = c end
134+
end
135+
end
136+
if ready ~= nil and ready.status == "True" then
137+
return {status = "Healthy", message = ready.message}
138+
end
139+
if progressing ~= nil then
140+
if progressing.status == "True" then
141+
return {status = "Progressing", message = progressing.message}
142+
else
143+
return {status = "Degraded", message = progressing.message}
144+
end
145+
end
146+
return {status = "Progressing", message = "Waiting for conditions"}
147+
```
148+
149+
With a health check like this in place:
150+
151+
- ArgoCD shows **Healthy** once `Ready=True`.
152+
- ArgoCD shows **Progressing** while a rollout is in-flight (`Progressing=True`).
153+
- ArgoCD shows **Degraded** when progress is blocked (`Progressing=False` with an error reason).
154+
155+
If you use [sync waves](https://argo-cd.readthedocs.io/en/stable/user-guide/sync-waves/) and workers must be fully rolled out before a dependent service is updated, place the `TemporalWorkerDeployment` in an earlier wave.
156+
157+
> **Verify:** ArgoCD's health customization API and Lua runtime have changed across versions. Test your health check script in a non-production environment before relying on it to gate sync waves.
158+
159+
## Flux
160+
161+
### Kustomization
162+
163+
Flux's `Kustomization` controller uses kstatus to assess resource health. Because `TemporalWorkerDeployment` emits a standard `Ready` condition, Flux should treat it as healthy when `Ready=True`. Adding an explicit `healthChecks` entry makes the dependency visible and ensures Flux waits on the `TemporalWorkerDeployment` before marking the Kustomization as ready:
164+
165+
```yaml
166+
apiVersion: kustomize.toolkit.fluxcd.io/v1
167+
kind: Kustomization
168+
metadata:
169+
name: my-workers
170+
namespace: flux-system
171+
spec:
172+
interval: 5m
173+
path: ./workers
174+
prune: true
175+
sourceRef:
176+
kind: GitRepository
177+
name: my-repo
178+
healthChecks:
179+
- apiVersion: temporal.io/v1alpha1
180+
kind: TemporalWorkerDeployment
181+
name: my-worker
182+
namespace: my-namespace
183+
timeout: 10m
184+
```
185+
186+
Set `timeout` to exceed the longest expected rollout duration.
187+
188+
### HelmRelease
189+
190+
Flux's `helm-controller` uses kstatus by default for post-install/post-upgrade health assessment, so a `HelmRelease` deploying your worker chart should automatically wait for `Ready=True` on any `TemporalWorkerDeployment` resources in the release:
191+
192+
```yaml
193+
apiVersion: helm.toolkit.fluxcd.io/v2
194+
kind: HelmRelease
195+
metadata:
196+
name: my-worker
197+
namespace: flux-system
198+
spec:
199+
interval: 5m
200+
timeout: 10m # should exceed the longest expected rollout
201+
chart:
202+
spec:
203+
chart: ./chart
204+
sourceRef:
205+
kind: GitRepository
206+
name: my-repo
207+
```
208+
209+
> **Verify:** kstatus integration details and the `healthChecks` API have evolved across Flux releases. Check the Flux documentation for your version.

internal/controller/execplan.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,11 @@ func (r *TemporalWorkerDeploymentReconciler) updateVersionConfig(ctx context.Con
202202
"Failed to set buildID %q as current version: %v", vcfg.BuildID, err)
203203
return fmt.Errorf("unable to set current deployment version: %w", err)
204204
}
205-
r.setCondition(workerDeploy, temporaliov1alpha1.ConditionRolloutComplete, metav1.ConditionTrue,
206-
temporaliov1alpha1.ReasonRolloutComplete, fmt.Sprintf("Rollout complete for buildID %s", vcfg.BuildID))
205+
// Update the in-memory status to reflect the promotion. The status was mapped
206+
// from Temporal state before plan execution, so it is stale at this point.
207+
// syncConditions (called at end of reconcile) derives Ready/Progressing from
208+
// TargetVersion.Status, so it must be current to avoid a one-cycle lag.
209+
workerDeploy.Status.TargetVersion.Status = temporaliov1alpha1.VersionStatusCurrent
207210
} else {
208211
if vcfg.RampPercentage > 0 {
209212
l.Info("applying ramp", "buildID", vcfg.BuildID, "percentage", vcfg.RampPercentage)
@@ -222,6 +225,14 @@ func (r *TemporalWorkerDeploymentReconciler) updateVersionConfig(ctx context.Con
222225
"Failed to set buildID %q as ramping version (%d%%): %v", vcfg.BuildID, vcfg.RampPercentage, err)
223226
return fmt.Errorf("unable to set ramping deployment version: %w", err)
224227
}
228+
// Same reasoning as the SetCurrent path above: update the in-memory status
229+
// so syncConditions sees the correct state on this reconcile cycle.
230+
if vcfg.RampPercentage > 0 {
231+
workerDeploy.Status.TargetVersion.Status = temporaliov1alpha1.VersionStatusRamping
232+
}
233+
// When RampPercentage == 0 we are clearing a stale ramp on a different build ID
234+
// (see planner: "Reset ramp if needed"). The target version is already Current,
235+
// so no in-memory status update is needed here.
225236
}
226237

227238
if _, err := deploymentHandler.UpdateVersionMetadata(ctx, sdkclient.WorkerDeploymentUpdateVersionMetadataOptions{

0 commit comments

Comments
 (0)