Skip to content

Commit b7ba0a5

Browse files
committed
Fix race conditions and improve finalizer reliability
- Replace stale list iteration with proper polling using fresh queries - Add timeout and context cancellation handling with configurable constants - Implement field selector optimization with backward compatibility fallback - Replace brittle time.Sleep with condition-based polling in integration tests - Add comprehensive edge case tests for context cancellation and partial cleanup failures 🤖 Generated with [Claude Code](https://claude.ai/code)
1 parent 8b50488 commit b7ba0a5

3 files changed

Lines changed: 285 additions & 48 deletions

File tree

internal/controller/finalizer_test.go

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package controller
66

77
import (
88
"context"
9+
"strings"
910
"testing"
1011

1112
temporaliov1alpha1 "github.com/temporalio/temporal-worker-controller/api/v1alpha1"
@@ -239,3 +240,163 @@ func TestHandleDeletion(t *testing.T) {
239240
// by checking if the finalizer was removed (which we can't easily do since the resource is deleted)
240241
// Instead, we'll verify that the deletion handling completed without error, which means cleanup was successful
241242
}
243+
244+
func TestCleanupWithContextCancellation(t *testing.T) {
245+
// Create a context that will be cancelled during cleanup
246+
ctx, cancel := context.WithCancel(context.Background())
247+
248+
// Create a TemporalWorkerDeployment using test helpers
249+
workerDeploy := testhelpers.ModifyObj(testhelpers.MakeTWDWithName("test-worker", "default"), func(twd *temporaliov1alpha1.TemporalWorkerDeployment) *temporaliov1alpha1.TemporalWorkerDeployment {
250+
twd.UID = "worker-uid-123"
251+
return twd
252+
})
253+
254+
// Create fake client using test helpers
255+
client := testhelpers.SetupFakeClient()
256+
257+
reconciler := &TemporalWorkerDeploymentReconciler{
258+
Client: client,
259+
Scheme: testhelpers.SetupTestScheme(),
260+
}
261+
262+
// Create a test logger using testlogr
263+
logger := testlogr.New(t)
264+
265+
// Cancel the context immediately to simulate cancellation during cleanup
266+
cancel()
267+
268+
// Test cleanup with cancelled context - should handle gracefully
269+
err := reconciler.cleanupManagedResources(ctx, logger, workerDeploy)
270+
if err == nil {
271+
t.Error("Expected error when context is cancelled during cleanup")
272+
}
273+
274+
// Error should indicate context cancellation
275+
if ctx.Err() != context.Canceled {
276+
t.Error("Context should be cancelled")
277+
}
278+
}
279+
280+
func TestWaitForOwnedDeploymentsTimeout(t *testing.T) {
281+
ctx := context.Background()
282+
283+
// Create a TemporalWorkerDeployment using test helpers
284+
workerDeploy := testhelpers.ModifyObj(testhelpers.MakeTWDWithName("test-worker", "default"), func(twd *temporaliov1alpha1.TemporalWorkerDeployment) *temporaliov1alpha1.TemporalWorkerDeployment {
285+
twd.UID = "worker-uid-123"
286+
return twd
287+
})
288+
289+
// Create a deployment that won't be deleted (simulate stuck deletion)
290+
persistentDeployment := &appsv1.Deployment{
291+
ObjectMeta: metav1.ObjectMeta{
292+
Name: "persistent-deployment",
293+
Namespace: "default",
294+
OwnerReferences: []metav1.OwnerReference{
295+
{
296+
APIVersion: apiGVStr,
297+
Kind: "TemporalWorkerDeployment",
298+
Name: "test-worker",
299+
UID: "worker-uid-123",
300+
},
301+
},
302+
},
303+
}
304+
305+
// Create fake client with the deployment that won't be deleted
306+
client := testhelpers.SetupFakeClient(persistentDeployment)
307+
308+
reconciler := &TemporalWorkerDeploymentReconciler{
309+
Client: client,
310+
Scheme: testhelpers.SetupTestScheme(),
311+
}
312+
313+
// Create a test logger using testlogr
314+
logger := testlogr.New(t)
315+
316+
// Test with a very short timeout to simulate timeout condition
317+
// This will use the actual waitForOwnedDeploymentsToBeDeleted method which has built-in timeout
318+
err := reconciler.waitForOwnedDeploymentsToBeDeleted(ctx, logger, workerDeploy)
319+
320+
// Should timeout waiting for deployments to be deleted
321+
if err == nil {
322+
t.Error("Expected timeout error when deployments don't get deleted")
323+
}
324+
325+
// Error message should indicate timeout
326+
if err != nil && !contains(err.Error(), "timeout") {
327+
t.Errorf("Expected timeout error, got: %v", err)
328+
}
329+
}
330+
331+
func TestPartialCleanupFailure(t *testing.T) {
332+
ctx := context.Background()
333+
334+
// Create a TemporalWorkerDeployment using test helpers
335+
workerDeploy := testhelpers.ModifyObj(testhelpers.MakeTWDWithName("test-worker", "default"), func(twd *temporaliov1alpha1.TemporalWorkerDeployment) *temporaliov1alpha1.TemporalWorkerDeployment {
336+
twd.UID = "worker-uid-123"
337+
return twd
338+
})
339+
340+
// Create multiple deployments owned by the worker deployment
341+
deployment1 := &appsv1.Deployment{
342+
ObjectMeta: metav1.ObjectMeta{
343+
Name: "deployment-1",
344+
Namespace: "default",
345+
OwnerReferences: []metav1.OwnerReference{
346+
{
347+
APIVersion: apiGVStr,
348+
Kind: "TemporalWorkerDeployment",
349+
Name: "test-worker",
350+
UID: "worker-uid-123",
351+
},
352+
},
353+
},
354+
}
355+
356+
deployment2 := &appsv1.Deployment{
357+
ObjectMeta: metav1.ObjectMeta{
358+
Name: "deployment-2",
359+
Namespace: "default",
360+
OwnerReferences: []metav1.OwnerReference{
361+
{
362+
APIVersion: apiGVStr,
363+
Kind: "TemporalWorkerDeployment",
364+
Name: "test-worker",
365+
UID: "worker-uid-123",
366+
},
367+
},
368+
},
369+
}
370+
371+
// Create fake client with multiple deployments
372+
client := testhelpers.SetupFakeClient(deployment1, deployment2)
373+
374+
reconciler := &TemporalWorkerDeploymentReconciler{
375+
Client: client,
376+
Scheme: testhelpers.SetupTestScheme(),
377+
}
378+
379+
// Create a test logger using testlogr
380+
logger := testlogr.New(t)
381+
382+
// Delete one deployment manually to simulate partial cleanup
383+
err := client.Delete(ctx, deployment1)
384+
if err != nil {
385+
t.Fatalf("Failed to delete deployment1: %v", err)
386+
}
387+
388+
// Now test cleanup - it should handle the mixed state gracefully
389+
// (one deployment already deleted, one still exists)
390+
err = reconciler.cleanupManagedResources(ctx, logger, workerDeploy)
391+
392+
// This should eventually succeed as the cleanup logic should handle
393+
// deployments that are already deleted gracefully
394+
if err != nil && !contains(err.Error(), "timeout") {
395+
t.Errorf("Cleanup should handle partial cleanup gracefully, got error: %v", err)
396+
}
397+
}
398+
399+
// Helper function to check if a string contains a substring
400+
func contains(s, substr string) bool {
401+
return strings.Contains(s, substr)
402+
}

internal/controller/worker_controller.go

Lines changed: 77 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
appsv1 "k8s.io/api/apps/v1"
1818
apierrors "k8s.io/apimachinery/pkg/api/errors"
1919
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
20+
"k8s.io/apimachinery/pkg/fields"
2021
"k8s.io/apimachinery/pkg/runtime"
2122
"k8s.io/apimachinery/pkg/types"
2223
ctrl "sigs.k8s.io/controller-runtime"
@@ -38,6 +39,10 @@ const (
3839
buildIDLabel = "temporal.io/build-id"
3940
// TemporalWorkerDeploymentFinalizer is the finalizer used to ensure proper cleanup of resources
4041
TemporalWorkerDeploymentFinalizer = "temporal.io/temporal-worker-deployment-finalizer"
42+
43+
// Cleanup timeout and polling constants
44+
cleanupTimeout = 2 * time.Minute
45+
cleanupPollInterval = 5 * time.Second
4146
)
4247

4348
// TemporalWorkerDeploymentReconciler reconciles a TemporalWorkerDeployment object
@@ -238,18 +243,29 @@ func (r *TemporalWorkerDeploymentReconciler) handleDeletion(ctx context.Context,
238243
func (r *TemporalWorkerDeploymentReconciler) cleanupManagedResources(ctx context.Context, l logr.Logger, workerDeploy *temporaliov1alpha1.TemporalWorkerDeployment) error {
239244
l.Info("Cleaning up managed resources")
240245

241-
// List all deployments owned by this TemporalWorkerDeployment
242-
deploymentList := &appsv1.DeploymentList{}
246+
// Try to use field selector for efficient querying of owned deployments
247+
// Fall back to listing all deployments if field selector is not available (e.g., in tests)
243248
listOpts := &client.ListOptions{
244-
Namespace: workerDeploy.Namespace,
249+
Namespace: workerDeploy.Namespace,
250+
FieldSelector: fields.OneTermEqualSelector(deployOwnerKey, workerDeploy.Name),
245251
}
246252

247-
if err := r.List(ctx, deploymentList, listOpts); err != nil {
248-
return fmt.Errorf("failed to list deployments: %w", err)
253+
deploymentList := &appsv1.DeploymentList{}
254+
err := r.List(ctx, deploymentList, listOpts)
255+
if err != nil {
256+
// If field selector fails (common in tests), fall back to listing all deployments
257+
l.Info("Field selector not available, falling back to listing all deployments", "error", err.Error())
258+
listOpts = &client.ListOptions{
259+
Namespace: workerDeploy.Namespace,
260+
}
261+
if err := r.List(ctx, deploymentList, listOpts); err != nil {
262+
return fmt.Errorf("failed to list deployments: %w", err)
263+
}
249264
}
250265

251-
// Filter deployments owned by this TemporalWorkerDeployment and delete them
266+
// Delete all owned deployments
252267
for _, deployment := range deploymentList.Items {
268+
// Check ownership for all deployments when not using field selector
253269
if r.isOwnedByWorkerDeployment(&deployment, workerDeploy) {
254270
l.Info("Deleting managed deployment", "deployment", deployment.Name)
255271
if err := r.Delete(ctx, &deployment); err != nil && !apierrors.IsNotFound(err) {
@@ -258,29 +274,64 @@ func (r *TemporalWorkerDeploymentReconciler) cleanupManagedResources(ctx context
258274
}
259275
}
260276

261-
// Wait for all owned deployments to be deleted
262-
for _, deployment := range deploymentList.Items {
263-
if r.isOwnedByWorkerDeployment(&deployment, workerDeploy) {
264-
// Check if deployment still exists
265-
currentDeployment := &appsv1.Deployment{}
266-
err := r.Get(ctx, types.NamespacedName{
267-
Namespace: deployment.Namespace,
268-
Name: deployment.Name,
269-
}, currentDeployment)
270-
271-
if err == nil {
272-
// Deployment still exists, requeue to wait for deletion
273-
l.Info("Waiting for deployment to be deleted", "deployment", deployment.Name)
274-
return fmt.Errorf("still waiting for deployment %s to be deleted", deployment.Name)
275-
} else if !apierrors.IsNotFound(err) {
276-
return fmt.Errorf("failed to check deployment status %s: %w", deployment.Name, err)
277+
// Wait for all owned deployments to be deleted with proper polling
278+
return r.waitForOwnedDeploymentsToBeDeleted(ctx, l, workerDeploy)
279+
}
280+
281+
// waitForOwnedDeploymentsToBeDeleted waits for all owned deployments to be deleted with proper polling and timeout
282+
func (r *TemporalWorkerDeploymentReconciler) waitForOwnedDeploymentsToBeDeleted(ctx context.Context, l logr.Logger, workerDeploy *temporaliov1alpha1.TemporalWorkerDeployment) error {
283+
// Create a timeout context for cleanup operations
284+
cleanupCtx, cancel := context.WithTimeout(ctx, cleanupTimeout)
285+
defer cancel()
286+
287+
ticker := time.NewTicker(cleanupPollInterval)
288+
defer ticker.Stop()
289+
290+
l.Info("Waiting for owned deployments to be deleted", "timeout", cleanupTimeout)
291+
292+
for {
293+
select {
294+
case <-cleanupCtx.Done():
295+
if cleanupCtx.Err() == context.DeadlineExceeded {
296+
return fmt.Errorf("timeout waiting for deployments to be deleted after %v", cleanupTimeout)
297+
}
298+
return fmt.Errorf("context cancelled while waiting for deployments to be deleted: %w", cleanupCtx.Err())
299+
300+
case <-ticker.C:
301+
// Try to use field selector for efficient querying, with fallback
302+
listOpts := &client.ListOptions{
303+
Namespace: workerDeploy.Namespace,
304+
FieldSelector: fields.OneTermEqualSelector(deployOwnerKey, workerDeploy.Name),
305+
}
306+
307+
deploymentList := &appsv1.DeploymentList{}
308+
err := r.List(cleanupCtx, deploymentList, listOpts)
309+
if err != nil {
310+
// If field selector fails (common in tests), fall back to listing all deployments
311+
listOpts = &client.ListOptions{
312+
Namespace: workerDeploy.Namespace,
313+
}
314+
if err := r.List(cleanupCtx, deploymentList, listOpts); err != nil {
315+
return fmt.Errorf("failed to list deployments during cleanup: %w", err)
316+
}
317+
}
318+
319+
// Check if any owned deployments still exist
320+
hasOwnedDeployments := false
321+
for _, deployment := range deploymentList.Items {
322+
if r.isOwnedByWorkerDeployment(&deployment, workerDeploy) {
323+
hasOwnedDeployments = true
324+
l.Info("Still waiting for deployment to be deleted", "deployment", deployment.Name)
325+
break
326+
}
327+
}
328+
329+
if !hasOwnedDeployments {
330+
l.Info("All owned deployments have been deleted")
331+
return nil
277332
}
278-
// IsNotFound error means deployment was successfully deleted
279333
}
280334
}
281-
282-
l.Info("All managed resources have been cleaned up")
283-
return nil
284335
}
285336

286337
// isOwnedByWorkerDeployment checks if a deployment is owned by the given TemporalWorkerDeployment

0 commit comments

Comments
 (0)