Skip to content

Commit 3b78c2d

Browse files
committed
Add dashboard panels and alert for scan monitoring
This commit adds metric definitions to the Grafana dashboard for all new Prometheus metrics: - FULL_SCAN_ELAPSED, SCAN_COUNT (conductor) - BUCKET_PROCESSOR_SCAN_START_TIME, BUCKET_PROCESSOR_BUCKETS_COUNT. It also adds four new dashboard panels to the Lifecycle Conductor section: - Conductor Full Scan Elapsed Time - Conductor Scan Counts (buckets, lifecycle buckets, workflows) - Bucket Processor Scan Progress (per-pod scan-in-progress state) - Bucket Processor Lifecycle Buckets (per-pod processed count). Finally, it adds a LifecycleScanDurationHigh warning alert that fires when a conductor scan exceeds 1 hour. Issue: BB-740
1 parent 4bf34e2 commit 3b78c2d

File tree

2 files changed

+101
-0
lines changed

2 files changed

+101
-0
lines changed

monitoring/lifecycle/alerts.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,21 @@ groups:
8282
{{ ${lifecycle_latency_critical_threshold} | humanizeDuration }} ago.
8383
summary: "Lifecycle scan not executed in time"
8484

85+
# Warns when the latest conductor full scan took longer than 1 hour.
# Field names must be lowercase (expr/for/labels/annotations): Prometheus
# strictly unmarshals rule files and rejects unknown capitalized keys, so
# the capitalized variants would prevent the rule group from loading.
- alert: LifecycleScanDurationHigh
  expr: |
    s3_lifecycle_conductor_full_scan_elapsed_seconds{
      namespace="${namespace}", job="${job_lifecycle_producer}"
    } > 3600
  # Fire immediately once the elapsed-time gauge crosses the threshold.
  for: "0s"
  labels:
    severity: warning
  annotations:
    # NOTE(review): sibling alerts may carry zenko_service as a label
    # rather than an annotation — confirm against the rest of the file.
    zenko_service: backbeat-lifecycle-producer
    description: >-
      The latest lifecycle conductor full scan took more than 1 hour
      ({{ $value | humanizeDuration }}).
    summary: "Lifecycle conductor full scan duration is high"
99+
85100
- name: LifecycleBucketProcessor
86101
rules:
87102

monitoring/lifecycle/dashboard.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,26 @@ class Metrics:
7373
'status', job='${job_lifecycle_producer}', namespace='${namespace}',
7474
)
7575

76+
    # Elapsed wall-clock time (seconds) of the conductor's latest full scan;
    # also the series queried by the LifecycleScanDurationHigh alert.
    FULL_SCAN_ELAPSED = metrics.Metric(
        's3_lifecycle_conductor_full_scan_elapsed_seconds',
        job='${job_lifecycle_producer}', namespace='${namespace}',
    )

    # Per-'type' counts collected during a conductor scan; the dashboard
    # queries type values 'bucket', 'lifecycle_bucket' and 'workflow'.
    SCAN_COUNT = metrics.Metric(
        's3_lifecycle_conductor_scan_count',
        'type', job='${job_lifecycle_producer}', namespace='${namespace}',
    )

    # Start timestamp of the scan each bucket-processor pod is working on;
    # non-zero means a scan is in progress, 0 means idle (see the
    # "Bucket Processor Scan Progress" panel, which filters on > 0).
    BUCKET_PROCESSOR_SCAN_START_TIME = metrics.Metric(
        's3_lifecycle_bucket_processor_scan_start_time',
        job='${job_lifecycle_bucket_processor}', namespace='${namespace}',
    )

    # Number of lifecycle-enabled buckets processed by each bucket-processor
    # pod during the current scan.
    BUCKET_PROCESSOR_BUCKETS_COUNT = metrics.Metric(
        's3_lifecycle_bucket_processor_buckets_count',
        job='${job_lifecycle_bucket_processor}', namespace='${namespace}',
    )
95+
7696
S3_OPS = metrics.CounterMetric(
7797
's3_lifecycle_s3_operations_total',
7898
'origin', 'op', 'status', job=['$jobs'], namespace='${namespace}',
@@ -730,6 +750,70 @@ def color_override(name, color):
730750
]
731751
)
732752

753+
# Panel: duration (seconds) of the conductor's latest full scan, plotted as
# a single series — the legend is hidden since there is only one target.
lifecycle_full_scan_elapsed = TimeSeries(
    title="Conductor Full Scan Elapsed Time",
    dataSource="${DS_PROMETHEUS}",
    legendDisplayMode="hidden",
    unit=UNITS.SECONDS,
    targets=[
        Target(
            expr=Metrics.FULL_SCAN_ELAPSED(),
            legendFormat='Elapsed',
        ),
    ],
)
765+
766+
# Panel: per-type counts from the conductor scan — one series each for
# buckets, lifecycle-enabled buckets, and workflows, selected via the
# 'type' label of the SCAN_COUNT metric.
lifecycle_scan_count = TimeSeries(
    title="Conductor Scan Counts",
    dataSource="${DS_PROMETHEUS}",
    fillOpacity=20,
    lineInterpolation="smooth",
    targets=[
        Target(
            expr=Metrics.SCAN_COUNT(type='bucket'),
            legendFormat='Buckets',
        ),
        Target(
            expr=Metrics.SCAN_COUNT(type='lifecycle_bucket'),
            legendFormat='Lifecycle Buckets',
        ),
        Target(
            expr=Metrics.SCAN_COUNT(type='workflow'),
            legendFormat='Workflows',
        ),
    ],
)
786+
787+
# Panel: per-pod scan-in-progress state. The '> 0' filter drops idle pods
# (gauge resets to 0 when a pod finishes), so only active scans are drawn.
bucket_processor_scan_progress = TimeSeries(
    title="Bucket Processor Scan Progress",
    description="Shows whether each bucket processor is actively "
                "processing a scan. Non-zero = scan in progress (value is "
                "the scan start timestamp). When all pods reset to 0, the "
                "end-to-end scan is complete.",
    dataSource="${DS_PROMETHEUS}",
    lineInterpolation="smooth",
    targets=[
        Target(
            # Metric helper returns the PromQL selector as a string; the
            # comparison is appended as plain text.
            expr=Metrics.BUCKET_PROCESSOR_SCAN_START_TIME() + ' > 0',
            legendFormat='{{pod}}',
        ),
    ],
)
802+
803+
# Panel: per-pod count of lifecycle-enabled buckets processed during the
# current scan, one series per bucket-processor pod.
bucket_processor_buckets = TimeSeries(
    title="Bucket Processor Lifecycle Buckets",
    description="Number of lifecycle-enabled buckets processed by "
                "each bucket processor during the current scan.",
    dataSource="${DS_PROMETHEUS}",
    lineInterpolation="smooth",
    targets=[
        Target(
            expr=Metrics.BUCKET_PROCESSOR_BUCKETS_COUNT(),
            legendFormat='{{pod}}',
        ),
    ],
)
816+
733817
active_indexing_jobs = TimeSeries(
734818
title="Active Indexing jobs",
735819
dataSource="${DS_PROMETHEUS}",
@@ -897,6 +981,8 @@ def color_override(name, color):
897981
layout.row([s3_delete_object_ops, s3_delete_mpu_ops], height=8),
898982
RowPanel(title="Lifecycle Conductor"),
899983
layout.row([lifecycle_scans, trigger_latency], height=7),
984+
layout.row([lifecycle_full_scan_elapsed, lifecycle_scan_count], height=7),
985+
layout.row([bucket_processor_scan_progress, bucket_processor_buckets], height=7),
900986
layout.row([lifecycle_scan_rate, active_indexing_jobs, legacy_tasks], height=7),
901987
]),
902988
)

0 commit comments

Comments
 (0)