Skip to content

Commit 8ba8378

Browse files
authored
Merge pull request #641 from govuk-one-login/OJ-3243-jwt-alarms
OJ-3243: Refactor JWT verify alarms
2 parents a9174c7 + 880cab6 commit 8ba8378

1 file changed

Lines changed: 159 additions & 33 deletions

File tree

infrastructure/template.yaml

Lines changed: 159 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -351,59 +351,185 @@ Resources:
351351
MetricNamespace: !Sub "${AWS::StackName}/LogMessages"
352352
MetricName: "OTGFunction-Fatalerror"
353353

354-
SessionLambdaFailedToVerifyJWTAlarm:
354+
SessionLambdaFailedToVerifyJWTWarningAlarm:
355355
Type: AWS::CloudWatch::Alarm
356+
Condition: DeployAlarms
356357
Properties:
357358
AlarmDescription: !Sub
358-
- "Errors verifying JWTs that have been been received by the session lambda. Runbook: ${SupportManualURL}"
359+
- "Errors verifying JWTs (jwt_verification_failed) rate exceeds 10% of Session Lambda invocations consecutively for 3, 5 minute periods. Runbook: ${SupportManualURL}"
359360
- SupportManualURL: !FindInMap [StaticVariables, Urls, SupportManualURL]
360-
ActionsEnabled: true
361-
AlarmActions:
362-
# - !ImportValue core-infrastructure-AlarmTopic # OJ-3243: turning off pager duty notifications while we are seeing false positives
363-
- !ImportValue platform-alarm-critical-alert-topic
364-
OKActions:
365-
# - !ImportValue core-infrastructure-AlarmTopic # OJ-3243: turning off pager duty notifications while we are seeing false positives
366-
- !ImportValue platform-alarm-critical-alert-topic
367-
InsufficientDataActions: []
368-
MetricName: jwt_verification_failed
369-
Namespace: !Sub "${CriIdentifier}"
370-
Statistic: Sum
371-
Dimensions:
372-
- Name: service
373-
Value: !Sub "${CriIdentifier}-sessionTS"
374-
Period: 300
361+
ComparisonOperator: GreaterThanThreshold
362+
Threshold: 10
375363
DatapointsToAlarm: 3
376364
EvaluationPeriods: 3
377-
Threshold: 1
378-
ComparisonOperator: GreaterThanThreshold
379365
TreatMissingData: notBreaching
366+
AlarmActions:
367+
- !ImportValue platform-alarm-warning-alert-topic
368+
OKActions:
369+
- !ImportValue platform-alarm-warning-alert-topic
370+
Metrics:
371+
- Id: errors
372+
Expression: IF(m2 != 0, (m1 / m2) * 100, 0)
373+
Label: JWTErrorRate
374+
ReturnData: true
375+
- Id: m1
376+
ReturnData: false
377+
MetricStat:
378+
Metric:
379+
Namespace: !Sub "${CriIdentifier}"
380+
MetricName: jwt_verification_failed
381+
Dimensions:
382+
- Name: service
383+
Value: !Sub "${CriIdentifier}-sessionTS"
384+
Period: 300
385+
Stat: Sum
386+
- Id: m2
387+
ReturnData: false
388+
MetricStat:
389+
Metric:
390+
Namespace: AWS/Lambda
391+
MetricName: Invocations
392+
Dimensions:
393+
- Name: FunctionName
394+
Value: !Sub ${CommonStackName}-SessionFunctionTS
395+
Period: 300
396+
Stat: Sum
380397

381-
TokenLambdaFailedToVerifyJWTAlarm:
398+
SessionLambdaFailedToVerifyJWTCriticalAlarm:
382399
Type: AWS::CloudWatch::Alarm
400+
Condition: DeployAlarms
383401
Properties:
384402
AlarmDescription: !Sub
385-
- "Errors verifying JWTs that have been been received by the token lambda. Runbook: ${SupportManualURL}"
403+
- "Errors verifying JWTs (jwt_verification_failed) rate exceeds 80% of Session Lambda invocations consecutively for 3, 5 minute periods. Runbook: ${SupportManualURL}"
386404
- SupportManualURL: !FindInMap [StaticVariables, Urls, SupportManualURL]
387-
ActionsEnabled: true
405+
ComparisonOperator: GreaterThanThreshold
406+
Threshold: 80
407+
DatapointsToAlarm: 3
408+
EvaluationPeriods: 3
409+
TreatMissingData: notBreaching
388410
AlarmActions:
389-
# - !ImportValue core-infrastructure-AlarmTopic # OJ-3243: turning off pager duty notifications while we are seeing false positives
411+
- !ImportValue core-infrastructure-AlarmTopic
390412
- !ImportValue platform-alarm-critical-alert-topic
391413
OKActions:
392-
# - !ImportValue core-infrastructure-AlarmTopic # OJ-3243: turning off pager duty notifications while we are seeing false positives
414+
- !ImportValue core-infrastructure-AlarmTopic
393415
- !ImportValue platform-alarm-critical-alert-topic
394-
InsufficientDataActions: []
395-
MetricName: jwt_verification_failed
396-
Namespace: !Sub "${CriIdentifier}"
397-
Statistic: Sum
398-
Dimensions:
399-
- Name: service
400-
Value: !Sub "${CriIdentifier}-access-token-2"
401-
Period: 300
416+
Metrics:
417+
- Id: errors
418+
Expression: IF(m2 != 0, (m1 / m2) * 100, 0)
419+
Label: JWTErrorRate
420+
ReturnData: true
421+
- Id: m1
422+
ReturnData: false
423+
MetricStat:
424+
Metric:
425+
Namespace: !Sub "${CriIdentifier}"
426+
MetricName: jwt_verification_failed
427+
Dimensions:
428+
- Name: service
429+
Value: !Sub "${CriIdentifier}-sessionTS"
430+
Period: 300
431+
Stat: Sum
432+
- Id: m2
433+
ReturnData: false
434+
MetricStat:
435+
Metric:
436+
Namespace: AWS/Lambda
437+
MetricName: Invocations
438+
Dimensions:
439+
- Name: FunctionName
440+
Value: !Sub ${CommonStackName}-SessionFunctionTS
441+
Period: 300
442+
Stat: Sum
443+
444+
TokenLambdaFailedToVerifyJWTWarningAlarm:
445+
Type: AWS::CloudWatch::Alarm
446+
Condition: DeployAlarms
447+
Properties:
448+
AlarmDescription: !Sub
449+
- "Errors verifying JWTs (jwt_verification_failed) rate exceeds 10% of Token Lambda invocations consecutively for 3, 5 minute periods. Runbook: ${SupportManualURL}"
450+
- SupportManualURL: !FindInMap [StaticVariables, Urls, SupportManualURL]
451+
ComparisonOperator: GreaterThanThreshold
452+
Threshold: 10
402453
DatapointsToAlarm: 3
403454
EvaluationPeriods: 3
404-
Threshold: 1
455+
TreatMissingData: notBreaching
456+
AlarmActions:
457+
- !ImportValue platform-alarm-warning-alert-topic
458+
OKActions:
459+
- !ImportValue platform-alarm-warning-alert-topic
460+
Metrics:
461+
- Id: errors
462+
Expression: IF(m2 != 0, (m1 / m2) * 100, 0)
463+
Label: JWTErrorRate
464+
ReturnData: true
465+
- Id: m1
466+
ReturnData: false
467+
MetricStat:
468+
Metric:
469+
Namespace: !Sub "${CriIdentifier}"
470+
MetricName: jwt_verification_failed
471+
Dimensions:
472+
- Name: service
473+
Value: !Sub "${CriIdentifier}-access-token-2"
474+
Period: 300
475+
Stat: Sum
476+
- Id: m2
477+
ReturnData: false
478+
MetricStat:
479+
Metric:
480+
Namespace: AWS/Lambda
481+
MetricName: Invocations
482+
Dimensions:
483+
- Name: FunctionName
484+
Value: !Sub ${CommonStackName}-AccessTokenFunctionTS
485+
Period: 300
486+
Stat: Sum
487+
488+
TokenLambdaFailedToVerifyJWTCriticalAlarm:
489+
Type: AWS::CloudWatch::Alarm
490+
Condition: DeployAlarms
491+
Properties:
492+
AlarmDescription: !Sub
493+
- "Errors verifying JWTs (jwt_verification_failed) rate exceeds 80% of Token Lambda invocations consecutively for 3, 5 minute periods. Runbook: ${SupportManualURL}"
494+
- SupportManualURL: !FindInMap [StaticVariables, Urls, SupportManualURL]
405495
ComparisonOperator: GreaterThanThreshold
496+
Threshold: 80
497+
DatapointsToAlarm: 3
498+
EvaluationPeriods: 3
406499
TreatMissingData: notBreaching
500+
AlarmActions:
501+
- !ImportValue core-infrastructure-AlarmTopic
502+
- !ImportValue platform-alarm-critical-alert-topic
503+
OKActions:
504+
- !ImportValue core-infrastructure-AlarmTopic
505+
- !ImportValue platform-alarm-critical-alert-topic
506+
Metrics:
507+
- Id: errors
508+
Expression: IF(m2 != 0, (m1 / m2) * 100, 0)
509+
Label: JWTErrorRate
510+
ReturnData: true
511+
- Id: m1
512+
ReturnData: false
513+
MetricStat:
514+
Metric:
515+
Namespace: !Sub "${CriIdentifier}"
516+
MetricName: jwt_verification_failed
517+
Dimensions:
518+
- Name: service
519+
Value: !Sub "${CriIdentifier}-access-token-2"
520+
Period: 300
521+
Stat: Sum
522+
- Id: m2
523+
ReturnData: false
524+
MetricStat:
525+
Metric:
526+
Namespace: AWS/Lambda
527+
MetricName: Invocations
528+
Dimensions:
529+
- Name: FunctionName
530+
Value: !Sub ${CommonStackName}-AccessTokenFunctionTS
531+
Period: 300
532+
Stat: Sum
407533

408534
CheckHmrcLambdaConcurrency80Alarm:
409535
Type: AWS::CloudWatch::Alarm

0 commit comments

Comments
 (0)