Skip to content

Commit 1302ddd

Browse files
fix: Typo alert cosmos_biz_db_provisioned_throughput_exceeded (#3606)
* fix-typo-alert-name-cosmos_biz_db_provisioned_throughput_exceeded * disable cosmos_biz_db_normalized_ru_exceeded vs 429 errors * add comment * fix typo
1 parent 6a4b017 commit 1302ddd

File tree

2 files changed

+111
-2
lines changed

2 files changed

+111
-2
lines changed

src/domains/bizevents-common/03_cosmosdb_bizevents_datastore.tf

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ resource "azurerm_monitor_metric_alert" "cosmos_biz_db_normalized_ru_exceeded" {
213213
window_size = "PT5M"
214214
frequency = "PT5M"
215215
auto_mitigate = false
216+
enabled = false # TODO: temporarily disabled
216217

217218

218219
# Metric info
@@ -247,10 +248,16 @@ resource "azurerm_monitor_metric_alert" "cosmos_biz_db_normalized_ru_exceeded" {
247248
tags = module.tag_config.tags
248249
}
249250

251+
252+
# In general, for a production workload, if you see between 1-5% of requests with 429s,
253+
# and your end-to-end latency is acceptable, this is a healthy sign that the RU/s are being fully utilized.
254+
# In this case, the normalized RU consumption metric reaching 100% only means that in a given second,
255+
# at least one partition key range used all its provisioned throughput.
256+
# This is acceptable because the overall rate of 429s is still low. No further action is required.
250257
resource "azurerm_monitor_metric_alert" "cosmos_biz_db_provisioned_throughput_exceeded" { # https://github.com/pagopa/terraform-azurerm-v3/blob/58f14dc120e10bd3515bcc34e0685e74d1d11047/cosmosdb_account/main.tf#L205
251258
count = var.env_short == "p" ? 1 : 0
252259

253-
name = "[${var.domain != null ? "${var.domain} | " : ""}${module.bizevents_datastore_cosmosdb_account.name}] 409 Throttling Errors Exceeded"
260+
name = "[${var.domain != null ? "${var.domain} | " : ""}${module.bizevents_datastore_cosmosdb_account.name}] 429 Throttling Errors Exceeded"
254261
resource_group_name = azurerm_resource_group.bizevents_rg.name
255262
scopes = [module.bizevents_datastore_cosmosdb_account.id]
256263
description = "A collection throughput (RU/s) exceed provisioned throughput, and it's raising 429 errors. Please, consider to increase RU. Runbook: not needed."
@@ -267,7 +274,7 @@ resource "azurerm_monitor_metric_alert" "cosmos_biz_db_provisioned_throughput_ex
267274
metric_name = "TotalRequestUnits"
268275
aggregation = "Total"
269276
operator = "GreaterThan"
270-
threshold = 1
277+
threshold = 100 # https://learn.microsoft.com/en-us/azure/cosmos-db/monitor-normalized-request-units#what-to-expect-and-do-when-normalized-rus-is-higher
271278
skip_metric_validation = false
272279

273280
dimension {

src/domains/bizevents-secrets/.terraform.lock.hcl

Lines changed: 102 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)