Skip to content

Commit dd61c43

Browse files
committed
add support for benign application error category
1 parent e77d632 commit dd61c43

File tree

5 files changed

+110
-14
lines changed

5 files changed

+110
-14
lines changed

internal/error.go

+64
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ Workflow consumers will get an instance of *WorkflowExecutionError. This error w
119119
*/
120120

121121
type (
122+
// Category of the error. Maps to logging/metrics behaviours.
123+
ApplicationErrorCategory string
124+
122125
// ApplicationErrorOptions represents a combination of error attributes and additional requests.
123126
// All fields are optional, providing flexibility in error customization.
124127
//
@@ -137,6 +140,7 @@ type (
137140
//
138141
// NOTE: This option is supported by Temporal Server >= v1.24.2 older version will ignore this value.
139142
NextRetryDelay time.Duration
143+
Category ApplicationErrorCategory
140144
}
141145

142146
// ApplicationError returned from activity implementations with message and optional details.
@@ -150,6 +154,7 @@ type (
150154
cause error
151155
details converter.EncodedValues
152156
nextRetryDelay time.Duration
157+
category ApplicationErrorCategory
153158
}
154159

155160
// TimeoutError returned when activity or child workflow timed out.
@@ -380,6 +385,11 @@ var (
380385
ErrMissingWorkflowID = errors.New("workflow ID is unset for Nexus operation")
381386
)
382387

388+
const (
389+
// ErrorCategoryBenign indicates an error that is expected under normal operation and should not trigger alerts.
390+
ErrorCategoryBenign ApplicationErrorCategory = "benign"
391+
)
392+
383393
// NewApplicationError create new instance of *ApplicationError with message, type, and optional details.
384394
func NewApplicationError(msg string, errType string, nonRetryable bool, cause error, details ...interface{}) error {
385395
return NewApplicationErrorWithOptions(
@@ -397,6 +407,7 @@ func NewApplicationErrorWithOptions(msg string, errType string, options Applicat
397407
cause: options.Cause,
398408
nonRetryable: options.NonRetryable,
399409
nextRetryDelay: options.NextRetryDelay,
410+
category: options.Category,
400411
}
401412
// When return error to user, use EncodedValues as details and data is ready to be decoded by calling Get
402413
details := options.Details
@@ -661,6 +672,11 @@ func (e *ApplicationError) Unwrap() error {
661672
// a zero value means to use the activities retry policy.
662673
func (e *ApplicationError) NextRetryDelay() time.Duration { return e.nextRetryDelay }
663674

675+
// Category returns the ApplicationErrorCategory of the error.
676+
func (e *ApplicationError) Category() ApplicationErrorCategory {
677+
return e.category
678+
}
679+
664680
// Error from error interface
665681
func (e *TimeoutError) Error() string {
666682
msg := fmt.Sprintf("%s (type: %s)", e.message(), e.timeoutType)
@@ -1029,3 +1045,51 @@ func getErrType(err error) string {
10291045

10301046
return t.Name()
10311047
}
1048+
1049+
// applicationErrorCategoryToProto converts the internal ApplicationErrorCategory to its proto representation.
1050+
func applicationErrorCategoryToProto(category ApplicationErrorCategory) enumspb.ApplicationErrorCategory {
1051+
switch category {
1052+
case ErrorCategoryBenign:
1053+
return enumspb.APPLICATION_ERROR_CATEGORY_BENIGN
1054+
case "":
1055+
// Zero value maps to unspecified
1056+
return enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED
1057+
default:
1058+
// Fallback to unspecified if unknown case
1059+
return enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED
1060+
}
1061+
}
1062+
1063+
// applicationErrorCategoryFromProto converts the proto ApplicationErrorCategory to its internal representation.
1064+
func applicationErrorCategoryFromProto(category enumspb.ApplicationErrorCategory) ApplicationErrorCategory {
1065+
switch category {
1066+
case enumspb.APPLICATION_ERROR_CATEGORY_BENIGN:
1067+
return ErrorCategoryBenign
1068+
case enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED:
1069+
return "" // Map unspecified back to the zero value
1070+
default:
1071+
// Handle unknown proto values - map to the zero value (unspecified)
1072+
// Consider logging a warning if an unexpected proto value is received.
1073+
return ""
1074+
}
1075+
}
1076+
1077+
// IsBenignApplicationError checks if the given error is an ApplicationError
1078+
// with the category set to CategoryBenign.
1079+
func IsBenignApplicationError(err error) bool {
1080+
var appError *ApplicationError
1081+
// Use errors.As to check type and get value simultaneously.
1082+
// Then check if the category field matches CategoryBenign.
1083+
return errors.As(err, &appError) && appError.Category() == ErrorCategoryBenign
1084+
}
1085+
1086+
// isBenignProtoApplicationFailure checks if the given proto Failure represents
1087+
// an ApplicationFailure with the category set to Benign.
1088+
func isBenignProtoApplicationFailure(failure *failurepb.Failure) bool {
1089+
if failure == nil {
1090+
return false
1091+
}
1092+
appFailureInfo := failure.GetApplicationFailureInfo()
1093+
// Check if it's an ApplicationFailureInfo and if its category is Benign.
1094+
return appFailureInfo != nil && appFailureInfo.GetCategory() == enumspb.APPLICATION_ERROR_CATEGORY_BENIGN
1095+
}

internal/error_test.go

+19-2
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,7 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T)
723723
NonRetryable: true,
724724
Cause: errors.New("cause error"),
725725
Details: []interface{}{"details", 2208},
726+
Category: ErrorCategoryBenign,
726727
},
727728
)
728729
f := fc.ErrorToFailure(err)
@@ -734,15 +735,27 @@ func Test_convertErrorToFailure_ApplicationErrorWithExtraRequests(t *testing.T)
734735
require.Equal("cause error", f.GetCause().GetMessage())
735736
require.Equal("", f.GetCause().GetApplicationFailureInfo().GetType())
736737
require.Nil(f.GetCause().GetCause())
738+
require.Equal(enumspb.APPLICATION_ERROR_CATEGORY_BENIGN, f.GetApplicationFailureInfo().GetCategory())
737739

738740
err2 := fc.FailureToError(f)
739741
var applicationErr *ApplicationError
740742
require.True(errors.As(err2, &applicationErr))
741743
require.Equal("message (type: customType, retryable: false): cause error", applicationErr.Error())
744+
require.Equal(ErrorCategoryBenign, applicationErr.Category())
742745

743746
err2 = errors.Unwrap(err2)
744747
require.True(errors.As(err2, &applicationErr))
745748
require.Equal("cause error", applicationErr.Error())
749+
750+
err := NewApplicationErrorWithOptions(
751+
"another message",
752+
"another customType",
753+
ApplicationErrorOptions{
754+
Category: "",
755+
},
756+
)
757+
f := fc.ErrorToFailure(err)
758+
require.Equal(enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED, f.GetApplicationFailureInfo().GetCategory())
746759
}
747760

748761
func Test_convertErrorToFailure_EncodeMessage(t *testing.T) {
@@ -1104,6 +1117,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) {
11041117
Type: "MyCoolType",
11051118
NonRetryable: true,
11061119
Details: details,
1120+
Category: enumspb.APPLICATION_ERROR_CATEGORY_BENIGN,
11071121
}},
11081122
Cause: &failurepb.Failure{
11091123
Message: "cause message",
@@ -1120,6 +1134,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) {
11201134
require.Equal("message (type: MyCoolType, retryable: false): cause message (type: UnknownType, retryable: true)", applicationErr.Error())
11211135
require.Equal("MyCoolType", applicationErr.Type())
11221136
require.Equal(true, applicationErr.NonRetryable())
1137+
require.Equal(ErrorCategoryBenign, applicationErr.Category())
11231138
var str string
11241139
var n int
11251140
require.NoError(applicationErr.Details(&str, &n))
@@ -1149,8 +1164,9 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) {
11491164
f = &failurepb.Failure{
11501165
Message: "message",
11511166
FailureInfo: &failurepb.Failure_ApplicationFailureInfo{ApplicationFailureInfo: &failurepb.ApplicationFailureInfo{
1152-
Type: "CoolError",
1153-
Details: details,
1167+
Type: "CoolError",
1168+
Details: details,
1169+
Category: enumspb.APPLICATION_ERROR_CATEGORY_UNSPECIFIED,
11541170
}},
11551171
}
11561172

@@ -1160,6 +1176,7 @@ func Test_convertFailureToError_ApplicationFailure(t *testing.T) {
11601176
require.Equal("message (type: CoolError, retryable: true)", coolErr.Error())
11611177
require.Equal("CoolError", coolErr.Type())
11621178
require.Equal(false, coolErr.NonRetryable())
1179+
require.Equal("", coolErr.Category())
11631180
}
11641181

11651182
func Test_convertFailureToError_CanceledFailure(t *testing.T) {

internal/failure_converter.go

+2
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ func (dfc *DefaultFailureConverter) ErrorToFailure(err error) *failurepb.Failure
115115
NonRetryable: err.NonRetryable(),
116116
Details: convertErrDetailsToPayloads(err.details, dfc.dataConverter),
117117
NextRetryDelay: delay,
118+
Category: applicationErrorCategoryToProto(err.Category()),
118119
}
119120
failure.FailureInfo = &failurepb.Failure_ApplicationFailureInfo{ApplicationFailureInfo: failureInfo}
120121
case *CanceledError:
@@ -250,6 +251,7 @@ func (dfc *DefaultFailureConverter) FailureToError(failure *failurepb.Failure) e
250251
Cause: dfc.FailureToError(failure.GetCause()),
251252
Details: []interface{}{details},
252253
NextRetryDelay: nextRetryDelay,
254+
Category: applicationErrorCategoryFromProto(applicationFailureInfo.GetCategory()),
253255
},
254256
)
255257
}

internal/internal_task_handlers.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -2311,7 +2311,11 @@ func (ath *activityTaskHandlerImpl) Execute(taskQueue string, t *workflowservice
23112311
return nil, ctx.Err()
23122312
}
23132313
if err != nil && err != ErrActivityResultPending {
2314-
ath.logger.Error("Activity error.",
2314+
logFunc := ath.logger.Error // Default to Error
2315+
if IsBenignApplicationError(err) {
2316+
logFunc = ath.logger.Debug // Use Debug for benign application errors
2317+
}
2318+
logFunc("Activity error.",
23152319
tagWorkflowID, t.WorkflowExecution.GetWorkflowId(),
23162320
tagRunID, t.WorkflowExecution.GetRunId(),
23172321
tagActivityType, activityType,

internal/internal_task_pollers.go

+20-11
Original file line numberDiff line numberDiff line change
@@ -474,19 +474,26 @@ func (wtp *workflowTaskPoller) RespondTaskCompletedWithMetrics(
474474
) (response *workflowservice.RespondWorkflowTaskCompletedResponse, err error) {
475475
metricsHandler := wtp.metricsHandler.WithTags(metrics.WorkflowTags(task.WorkflowType.GetName()))
476476
if taskErr != nil {
477-
wtp.logger.Warn("Failed to process workflow task.",
478-
tagWorkflowType, task.WorkflowType.GetName(),
479-
tagWorkflowID, task.WorkflowExecution.GetWorkflowId(),
480-
tagRunID, task.WorkflowExecution.GetRunId(),
481-
tagAttempt, task.Attempt,
482-
tagError, taskErr)
483477
failWorkflowTask := wtp.errorToFailWorkflowTask(task.TaskToken, taskErr)
478+
completedRequest = failWorkflowTask
479+
484480
failureReason := "WorkflowError"
485481
if failWorkflowTask.Cause == enumspb.WORKFLOW_TASK_FAILED_CAUSE_NON_DETERMINISTIC_ERROR {
486482
failureReason = "NonDeterminismError"
487483
}
488-
incrementWorkflowTaskFailureCounter(metricsHandler, failureReason)
489-
completedRequest = failWorkflowTask
484+
485+
logFunc := wtp.logger.Warn
486+
if IsBenignApplicationError(taskErr) {
487+
logFunc = wtp.logger.Debug
488+
} else {
489+
incrementWorkflowTaskFailureCounter(metricsHandler, failureReason)
490+
}
491+
logFunc("Failed to process workflow task.",
492+
tagWorkflowType, task.WorkflowType.GetName(),
493+
tagWorkflowID, task.WorkflowExecution.GetWorkflowId(),
494+
tagRunID, task.WorkflowExecution.GetRunId(),
495+
tagAttempt, task.Attempt,
496+
tagError, taskErr)
490497
}
491498

492499
metricsHandler.Timer(metrics.WorkflowTaskExecutionLatency).Record(time.Since(startTime))
@@ -705,7 +712,7 @@ func (lath *localActivityTaskHandler) executeLocalActivityTask(task *localActivi
705712
metricsHandler.Counter(metrics.LocalActivityErrorCounter).Inc(1)
706713
err = newPanicError(p, st)
707714
}
708-
if err != nil {
715+
if err != nil && !IsBenignApplicationError(err) {
709716
metricsHandler.Counter(metrics.LocalActivityFailedCounter).Inc(1)
710717
metricsHandler.Counter(metrics.LocalActivityExecutionFailedCounter).Inc(1)
711718
}
@@ -1104,8 +1111,10 @@ func (atp *activityTaskPoller) ProcessTask(task interface{}) error {
11041111
return err
11051112
}
11061113
// in case if activity execution failed, request should be of type RespondActivityTaskFailedRequest
1107-
if _, ok := request.(*workflowservice.RespondActivityTaskFailedRequest); ok {
1108-
activityMetricsHandler.Counter(metrics.ActivityExecutionFailedCounter).Inc(1)
1114+
if req, ok := request.(*workflowservice.RespondActivityTaskFailedRequest); ok {
1115+
if !isBenignProtoApplicationFailure(req.Failure) {
1116+
activityMetricsHandler.Counter(metrics.ActivityExecutionFailedCounter).Inc(1)
1117+
}
11091118
}
11101119
activityMetricsHandler.Timer(metrics.ActivityExecutionLatency).Record(time.Since(executionStartTime))
11111120

0 commit comments

Comments
 (0)