Skip to content

Commit 02e0483

Browse files
carlydfclaude
andauthored
Prepare to include cluster UID in CONTROLLER_IDENTITY to prevent cross-cluster conflicts (#308)
<!--- Note to EXTERNAL Contributors --> <!-- Thanks for opening a PR! If it is a significant code change, please **make sure there is an open issue** for this. We work best with you when we have accepted the idea first before you code. --> <!--- For ALL Contributors 👇 --> ## What was changed In `shouldClaimManagerIdentity`, detect future format of manager identity ## Why? To facilitate clean reclaim after rollback from the next release, which will include #309 ## Checklist <!--- add/delete as needed ---> 1. Closes <!-- add issue number here --> 2. How was this tested: Envtest won't test our get ns permissions, so just trusting there (we will test it in CI env because it runs on controller startup) 3. Any docs updates needed? <!--- update README if applicable or point out where to update docs.temporal.io --> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 9ff2733 commit 02e0483

11 files changed

Lines changed: 79 additions & 22 deletions

File tree

cmd/main.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,19 @@
55
package main
66

77
import (
8+
"context"
89
"flag"
10+
"fmt"
911
"log/slog"
1012
"os"
1113

1214
temporaliov1alpha1 "github.com/temporalio/temporal-worker-controller/api/v1alpha1"
1315
"github.com/temporalio/temporal-worker-controller/internal/controller"
1416
"github.com/temporalio/temporal-worker-controller/internal/controller/clientpool"
1517
"go.temporal.io/sdk/log"
18+
corev1 "k8s.io/api/core/v1"
1619
"k8s.io/apimachinery/pkg/runtime"
20+
"k8s.io/apimachinery/pkg/types"
1721
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
1822
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
1923
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -112,6 +116,21 @@ func main() {
112116
os.Exit(1)
113117
}
114118

119+
podNamespace := os.Getenv("POD_NAMESPACE")
120+
if podNamespace == "" {
121+
setupLog.Error(nil, "POD_NAMESPACE environment variable must be set")
122+
os.Exit(1)
123+
}
124+
var ns corev1.Namespace
125+
if err := mgr.GetAPIReader().Get(context.Background(), types.NamespacedName{Name: podNamespace}, &ns); err != nil {
126+
setupLog.Error(err, "unable to fetch namespace UID for controller identity suffix")
127+
os.Exit(1)
128+
}
129+
if err := os.Setenv(controller.IdentitySuffixEnvKey, string(ns.UID)); err != nil {
130+
setupLog.Error(err, fmt.Sprintf("unable to set %s", controller.IdentitySuffixEnvKey))
131+
os.Exit(1)
132+
}
133+
115134
setupLog.Info("starting manager")
116135
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
117136
setupLog.Error(err, "problem running manager")

docs/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Technical constraints and limitations of the Temporal Worker Controller system,
3333
Comprehensive guide for migrating from existing unversioned worker deployment systems to the Temporal Worker Controller. Includes step-by-step instructions, configuration mapping, and common patterns.
3434
See [Migration to Unversioned](migration-to-unversioned.md) for how to migrate back to an unversioned deployment system.
3535

36-
### [Ownership](ownership.md)
36+
### [Ownership](manager-identity.md)
3737
How the controller gets permission to manage a Worker Deployment, how a human client can take or give back control.
3838

3939
### [WorkerResourceTemplate](worker-resource-templates.md)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Ownership Transfer in the Worker Controller
1+
# Manager Identity and Ownership Transfer in the Worker Controller
22

33
## Problem
44

helm/temporal-worker-controller/templates/rbac.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ rules:
7171
verbs:
7272
- create
7373
- patch
74+
- apiGroups:
75+
- ""
76+
resources:
77+
- namespaces
78+
verbs:
79+
- get
7480
- apiGroups:
7581
- ""
7682
resources:

internal/controller/execplan.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,17 @@ func (r *TemporalWorkerDeploymentReconciler) startTestWorkflows(ctx context.Cont
184184
}
185185

186186
func (r *TemporalWorkerDeploymentReconciler) shouldClaimManagerIdentity(vcfg *planner.VersionConfig) bool {
187-
return vcfg.ManagerIdentity == ""
187+
existing := vcfg.ManagerIdentity
188+
if existing == "" {
189+
return true // unclaimed
190+
}
191+
// In the next release, the namespace UID will be included in the controller identity.
192+
// To support smooth rollback, in this release, we will detect the future format and
193+
// treat that as a reclaimable claim.
194+
if existing == getControllerIdentityWithNamespaceUID() {
195+
return true
196+
}
197+
return false
188198
}
189199

190200
func (r *TemporalWorkerDeploymentReconciler) claimManagerIdentity(
@@ -194,18 +204,19 @@ func (r *TemporalWorkerDeploymentReconciler) claimManagerIdentity(
194204
deploymentHandler sdkclient.WorkerDeploymentHandle,
195205
vcfg *planner.VersionConfig,
196206
) error {
207+
identity := getControllerIdentity()
197208
resp, err := deploymentHandler.SetManagerIdentity(ctx, sdkclient.WorkerDeploymentSetManagerIdentityOptions{
198209
Self: true,
199210
ConflictToken: vcfg.ConflictToken,
200-
Identity: getControllerIdentity(),
211+
Identity: identity,
201212
})
202213
if err != nil {
203214
l.Error(err, "unable to claim manager identity")
204215
r.Recorder.Eventf(workerDeploy, corev1.EventTypeWarning, ReasonManagerIdentityClaimFailed,
205216
"Failed to claim manager identity: %v", err)
206217
return err
207218
}
208-
l.Info("claimed manager identity", "identity", getControllerIdentity())
219+
l.Info("claimed manager identity", "identity", identity)
209220
// Use the updated conflict token for the subsequent routing config change.
210221
vcfg.ConflictToken = resp.ConflictToken
211222
return nil
@@ -275,8 +286,8 @@ func (r *TemporalWorkerDeploymentReconciler) updateVersionConfig(ctx context.Con
275286
},
276287
MetadataUpdate: sdkclient.WorkerDeploymentMetadataUpdate{
277288
UpsertEntries: map[string]interface{}{
278-
controllerIdentityMetadataKey: getControllerIdentity(),
279-
controllerVersionMetadataKey: getControllerVersion(),
289+
IdentityMetadataKey: getControllerIdentity(),
290+
VersionMetadataKey: getControllerVersion(),
280291
},
281292
},
282293
}); err != nil { // would be cool to do this atomically with the update

internal/controller/suite_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package controller
66

77
import (
8+
"os"
89
"path/filepath"
910
"testing"
1011

@@ -26,6 +27,11 @@ var cfg *rest.Config
2627
var k8sClient client.Client
2728
var testEnv *envtest.Environment
2829

30+
func TestMain(m *testing.M) {
31+
_ = os.Setenv(IdentityEnvKey, "test-controller-identity")
32+
os.Exit(m.Run())
33+
}
34+
2935
func TestControllers(t *testing.T) {
3036
RegisterFailHandler(Fail)
3137

internal/controller/util.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,13 @@ const (
3131
)
3232

3333
const (
34-
controllerIdentityMetadataKey = "temporal.io/controller"
35-
controllerVersionMetadataKey = "temporal.io/controller-version"
34+
IdentityMetadataKey = "temporal.io/controller"
35+
VersionMetadataKey = "temporal.io/controller-version"
3636

37-
controllerVersionEnvKey = "CONTROLLER_VERSION"
38-
controllerIdentityEnvKey = "CONTROLLER_IDENTITY"
39-
ControllerMaxDeploymentVersionsIneligibleForDeletionEnvKey = "CONTROLLER_MAX_DEPLOYMENT_VERSIONS_INELIGIBLE_FOR_DELETION"
37+
VersionEnvKey = "CONTROLLER_VERSION"
38+
IdentityEnvKey = "CONTROLLER_IDENTITY"
39+
IdentitySuffixEnvKey = "CONTROLLER_IDENTITY_SUFFIX"
40+
MaxDeploymentVersionsIneligibleForDeletionEnvKey = "CONTROLLER_MAX_DEPLOYMENT_VERSIONS_INELIGIBLE_FOR_DELETION"
4041

4142
serverDeleteVersionIdentity = "try-delete-for-add-version"
4243
)
@@ -51,22 +52,31 @@ func getControllerVersion() string {
5152
return Version
5253
}
5354
// Fall back to environment variable (set by Helm from image.tag)
54-
if version := os.Getenv(controllerVersionEnvKey); version != "" {
55+
if version := os.Getenv(VersionEnvKey); version != "" {
5556
return version
5657
}
5758
return "unknown"
5859
}
5960

60-
// getControllerIdentity returns the identity from environment variable (set by Helm)
61+
// getControllerIdentity returns the identity from environment variable (set by Helm).
62+
// Returns empty string if unset. main() enforces this at startup, but that check is
63+
// bypassed if the reconciler is used as a library (e.g. embedded in another controller
64+
// manager or in tests). An empty return means the env var was not set before starting.
6165
func getControllerIdentity() string {
62-
if identity := os.Getenv(controllerIdentityEnvKey); identity != "" {
66+
if identity := os.Getenv(IdentityEnvKey); identity != "" {
6367
return identity
6468
}
65-
return defaults.ControllerIdentity
69+
return defaults.ToBeDeprecatedDefaultControllerIdentity
70+
}
71+
72+
// getControllerIdentityWithNamespaceUID returns the identity which will be used in the
73+
// next release. Used in this release for smooth rollback identity reclamation.
74+
func getControllerIdentityWithNamespaceUID() string {
75+
return getControllerIdentity() + "/" + os.Getenv(IdentitySuffixEnvKey)
6676
}
6777

6878
func GetControllerMaxDeploymentVersionsIneligibleForDeletion() int32 {
69-
if maxStr := os.Getenv(ControllerMaxDeploymentVersionsIneligibleForDeletionEnvKey); maxStr != "" {
79+
if maxStr := os.Getenv(MaxDeploymentVersionsIneligibleForDeletionEnvKey); maxStr != "" {
7080
i, err := strconv.Atoi(maxStr)
7181
if err == nil {
7282
return int32(i)

internal/controller/worker_controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ type TemporalWorkerDeploymentReconciler struct {
112112
// +kubebuilder:rbac:groups=temporal.io,resources=temporalworkerdeployments/finalizers,verbs=update
113113
// +kubebuilder:rbac:groups=temporal.io,resources=temporalconnections,verbs=get;list;watch;update;patch
114114
// +kubebuilder:rbac:groups=temporal.io,resources=temporalconnections/finalizers,verbs=update
115+
// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get
115116
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
116117
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
117118
// +kubebuilder:rbac:groups=apps,resources=deployments/scale,verbs=update
@@ -133,6 +134,7 @@ func (r *TemporalWorkerDeploymentReconciler) Reconcile(ctx context.Context, req
133134
defer cancel()
134135

135136
l := log.FromContext(ctx)
137+
136138
l.V(1).Info("Running Reconcile loop")
137139

138140
// Fetch the worker deployment

internal/defaults/defaults.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,7 @@ const (
1111
DeleteDelay = 24 * time.Hour
1212
ServerMaxVersions = 100
1313
MaxVersionsIneligibleForDeletion = int32(ServerMaxVersions * 0.75)
14-
ControllerIdentity = "temporal-worker-controller"
14+
15+
// ToBeDeprecatedDefaultControllerIdentity will stop being used in the next release.
16+
ToBeDeprecatedDefaultControllerIdentity = "temporal-worker-controller"
1517
)

internal/tests/internal/env_helpers.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ const (
4141
testDrainageRefreshInterval = time.Second
4242
testMaxVersionsIneligibleForDeletion = 5
4343
testMaxVersionsInDeployment = 6
44+
testControllerIdentity = "test-controller-identity"
4445
)
4546

4647
// setupKubebuilderAssets sets up the KUBEBUILDER_ASSETS environment variable if not already set
@@ -95,12 +96,13 @@ func getRepoRoot(t *testing.T) string {
9596
func setupTestEnvironment(t *testing.T) (*rest.Config, client.Client, manager.Manager, *clientpool.ClientPool, func()) {
9697
// Set faster reconcile interval for testing
9798
t.Setenv("RECONCILE_INTERVAL", "1s")
99+
t.Setenv(controller.IdentityEnvKey, testControllerIdentity)
98100
if kubeAssets := os.Getenv("KUBEBUILDER_ASSETS"); kubeAssets == "" {
99101
t.Skip("Skipping because KUBEBUILDER_ASSETS not set")
100102
}
101103

102104
// set max versions value for testing
103-
t.Setenv(controller.ControllerMaxDeploymentVersionsIneligibleForDeletionEnvKey, fmt.Sprintf("%d", testMaxVersionsIneligibleForDeletion))
105+
t.Setenv(controller.MaxDeploymentVersionsIneligibleForDeletionEnvKey, fmt.Sprintf("%d", testMaxVersionsIneligibleForDeletion))
104106

105107
// Setup kubebuilder assets for IDE testing
106108
if err := setupKubebuilderAssets(); err != nil {

0 commit comments

Comments
 (0)