Skip to content

Commit 2ef71ec

Browse files
chaptersixjaypipes
andauthored
fix: make errInvalidComputeConfig non-retryable in deployment activities (#10127)
## Summary - Changed `UpdateWorkerControllerInstanceFromDeployment` (activities.go) and `UpdateWorkerControllerInstance` (version_activities.go) to return `NewNonRetryableApplicationError` instead of `NewApplicationError` when wrapping `InvalidArgument` errors from the WCI client - Added functional test `TestCreateWorkerDeploymentVersion_InvalidComputeConfig_ReturnsPromptly` that verifies the error returns promptly when creating a version with invalid compute config on a busy deployment ## Context The `TestCreateWorkerDeploymentVersion_Errors` test in the CLI repo (temporalio/cli) is flaky. When `create-version` is called with invalid AWS Lambda credentials, the expected `"failed to assume role"` error is sometimes replaced by `"too many requests"`. Root cause: both activities wrap `InvalidArgument` errors as **retryable** `ApplicationError`. The activity retries up to 5 times (each attempt timing out trying to reach AWS IMDS), keeping the deployment workflow busy. The deployment client's retry loop then times out after 1 minute and `convertAndRecordError` masks the actual error as `"too many requests"`. The existing unit tests (`workflow_test.go:1543`, `version_workflow_test.go:2383`) already mock the activity returning `NewNonRetryableApplicationError` -- the activities just weren't matching that expectation. ## Test plan - [x] Existing unit tests pass (`go test ./service/worker/workerdeployment/...`) - [x] Existing functional tests pass (`TestCreateWorkerDeploymentVersion_InvalidScalingGroups`, `TestCreateWorkerDeploymentVersion_InvalidArgs`) - [x] New functional test passes 5/5 runs, consistently under 3 seconds - [ ] Verify CLI `TestCreateWorkerDeploymentVersion_Errors` passes after bumping server dependency Co-authored-by: Jay Pipes <jaypipes@gmail.com>
1 parent e004436 commit 2ef71ec

3 files changed

Lines changed: 48 additions & 2 deletions

File tree

service/worker/workerdeployment/activities.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ func (a *Activities) UpdateWorkerControllerInstanceFromDeployment(ctx context.Co
285285
if err != nil {
286286
var invalidArgs *serviceerror.InvalidArgument
287287
if errors.As(err, &invalidArgs) {
288-
return nil, temporal.NewApplicationError(err.Error(), errInvalidComputeConfig)
288+
return nil, temporal.NewNonRetryableApplicationError(err.Error(), errInvalidComputeConfig, nil)
289289
}
290290
return nil, err
291291
}

service/worker/workerdeployment/version_activities.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ func (a *VersionActivities) UpdateWorkerControllerInstance(ctx context.Context,
205205
if err != nil {
206206
var invalidArgs *serviceerror.InvalidArgument
207207
if errors.As(err, &invalidArgs) {
208-
return nil, temporal.NewApplicationError(err.Error(), errInvalidComputeConfig)
208+
return nil, temporal.NewNonRetryableApplicationError(err.Error(), errInvalidComputeConfig, nil)
209209
}
210210
return nil, err
211211
}

tests/worker_deployment_version_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3626,6 +3626,52 @@ func (s *DeploymentVersionSuite) TestCreateWorkerDeploymentVersion_MultipleVersi
36263626
}, 10*time.Second, 500*time.Millisecond)
36273627
}
36283628

3629+
// TestCreateWorkerDeploymentVersion_InvalidComputeConfig_ReturnsPromptly verifies that
3630+
// creating a version with invalid compute config returns an InvalidArgument error promptly,
3631+
// even when the deployment workflow is busy (e.g., processing a recently polled version).
3632+
// This is a regression test for a bug where the activity error was retryable, causing the
3633+
// deployment workflow to stay busy and the client to time out with "too many requests".
3634+
func (s *DeploymentVersionSuite) TestCreateWorkerDeploymentVersion_InvalidComputeConfig_ReturnsPromptly() {
3635+
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
3636+
defer cancel()
3637+
3638+
tv := testvars.New(s)
3639+
3640+
// Create a version via polling, which auto-creates the deployment and keeps
3641+
// the deployment workflow active with lazy-creation processing.
3642+
s.startVersionWorkflow(ctx, tv)
3643+
3644+
// Immediately attempt to create another version on the same deployment with
3645+
// invalid compute config. This should return an InvalidArgument error
3646+
// promptly (not time out with "too many requests").
3647+
tv2 := tv.WithBuildIDNumber(2)
3648+
invalidProvider := computeprovider.TestInvokeComputeProviderInvalidComputeProvider()
3649+
start := time.Now()
3650+
_, err := s.FrontendClient().CreateWorkerDeploymentVersion(ctx, &workflowservice.CreateWorkerDeploymentVersionRequest{
3651+
Namespace: s.Namespace().String(),
3652+
DeploymentVersion: &deploymentpb.WorkerDeploymentVersion{
3653+
DeploymentName: tv.DeploymentSeries(),
3654+
BuildId: tv2.BuildID(),
3655+
},
3656+
RequestId: tv2.Any().String(),
3657+
ComputeConfig: &computepb.ComputeConfig{
3658+
ScalingGroups: map[string]*computepb.ComputeConfigScalingGroup{
3659+
"sg1": {Provider: invalidProvider},
3660+
},
3661+
},
3662+
})
3663+
elapsed := time.Since(start)
3664+
3665+
s.Error(err)
3666+
var invalidArg *serviceerror.InvalidArgument
3667+
s.ErrorAs(err, &invalidArg)
3668+
s.Contains(invalidArg.Message, "illegal_field found in config")
3669+
// The error should return well before the deployment client's 1-minute
3670+
// retry timeout. If it takes longer than 30 seconds, the activity error
3671+
// is likely still retryable (the bug we're testing for).
3672+
s.Less(elapsed, 30*time.Second, "create-version with invalid compute config should fail promptly, not time out")
3673+
}
3674+
36293675
func (s *DeploymentVersionSuite) TestCreateWorkerDeploymentVersion_InvalidScalingGroups() {
36303676
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
36313677
defer cancel()

0 commit comments

Comments
 (0)