-- alert_suppression.lua
-- Module-level state shared by init() and create().
-- Settings table received by init(); create() reads its entries by name.
local SETTINGS = {}
-- Set to 1 by create() when it falls back to the system clock; not read anywhere in this file.
local USE_SYSTEM_TIME = 0
local lastAlertTimestamps = {} -- per alert group: timestamp of the last alert that was not suppressed
local alertCounts = {} -- per alert group: list of alert timestamps recorded by create() (a list, not a counter)
-- 1 until the first event has been handled; lets create() log the timestamp source only once.
local first_event_flag = 1
-- Name of the input field that carries the alert time; assigned in init() from SETTINGS["ALERT_TIME_FIELD"].
local stmpFieldName
-- Logging context name passed as the first argument to esp_logMessage.
-- NOTE(review): the value mentions CV_ANNOTATION although this script does alert suppression - confirm it is intended.
local LOGGING_CONTEXT = "DF.ESP.CUSTOM.CV_ANNOTATION"
-- Initialization hook: remembers the settings table and caches the name of
-- the timestamp field so create() does not have to look it up per event.
-- settings: table of configuration values keyed by setting name.
function init(settings)
    SETTINGS = settings
    -- Cache the configured time-field name (may be nil when not configured).
    stmpFieldName = settings.ALERT_TIME_FIELD
end
-- Alert timestamps are handled in microseconds (esp_getSystemMicro and the
-- "/ 1e6" conversions below), while the settings are expressed in seconds.
local MICROS_PER_SECOND = 1e6

-- Read a numeric setting by name. A missing or non-numeric value yields 0,
-- which the callers treat as "mode disabled".
local function numeric_setting(name)
    return tonumber(SETTINGS[name]) or 0
end

-- Helper function to clean up old alerts beyond COUNT_INTERVAL.
-- alert_group:    key into alertCounts.
-- reference_stmp: optional "now" in microseconds; when omitted the system
--                 clock (os.time() converted to microseconds) is used.
-- The stored list is replaced by a new list that keeps only the timestamps
-- not older than COUNT_INTERVAL seconds relative to the reference time.
local function cleanup_old_alerts(alert_group, reference_stmp)
    local stamps = alertCounts[alert_group]
    local interval = numeric_setting("COUNT_INTERVAL")
    if not stamps or interval <= 0 then
        return
    end
    local now = reference_stmp or (os.time() * MICROS_PER_SECOND)
    -- Convert the interval to microseconds so both sides use the same unit.
    local threshold = now - interval * MICROS_PER_SECOND
    local kept = {}
    for i = 1, #stamps do
        local stamp = stamps[i]
        if stamp >= threshold then
            kept[#kept + 1] = stamp
        end
    end
    alertCounts[alert_group] = kept
end

-- Process one incoming alert and decide whether it is suppressed.
-- data:    input event; reads alert_id, alert_group (optional, defaults to
--          "default") and the field named by stmpFieldName (optional).
-- context: unused here, kept for the caller's calling convention.
-- Returns a table with alert_id, alert_group, alert_stmp and
-- alert_suppressed (0 or 1), or nil when the alert is suppressed and
-- SETTINGS["OUTPUT_SUPPRESSED_EVENTS"] is not "1".
function create(data, context)
    local event = {}

    -- Get the current timestamp from the alert or using system time.
    -- A missing or non-numeric field falls back to the system clock instead
    -- of leaving the timestamp nil (which would fail on arithmetic later).
    local field_value
    if stmpFieldName ~= nil then
        field_value = data[stmpFieldName]
    end
    local current_stmp
    if field_value then
        current_stmp = tonumber(field_value)
    end
    if current_stmp then
        if first_event_flag == 1 then
            esp_logMessage(LOGGING_CONTEXT, "Field with name: '" .. tostring(stmpFieldName) .. "' exists in the input event schema, and will be used as timestamp for suppression", "info")
            first_event_flag = 0
        end
    else
        if first_event_flag == 1 then
            esp_logMessage(LOGGING_CONTEXT, "Field with name: '" .. tostring(stmpFieldName) .. "' doesn't exist in the input event schema, system time will be used instead", "info")
            first_event_flag = 0
        end
        USE_SYSTEM_TIME = 1
        current_stmp = esp_getSystemMicro()
    end

    local alert_group = data.alert_group or "default"
    local min_count = numeric_setting("MIN_COUNT")
    local count_interval = numeric_setting("COUNT_INTERVAL")
    local suppression_period = numeric_setting("SUPPRESSION_PERIOD")
    local log_suppressed = SETTINGS["LOG_SUPPRESSED_EVENTS"] == "1"

    -- The frequency check is active only when both settings are positive
    -- (their descriptions state that 0 or no setting disables the mode).
    -- Timestamps are recorded only in that case, so the per-group list
    -- cannot grow without bound while the mode is off.
    local frequency_mode = min_count > 0 and count_interval > 0
    local recent_count = 0
    if frequency_mode then
        local stamps = alertCounts[alert_group]
        if not stamps then
            stamps = {}
            alertCounts[alert_group] = stamps
        end
        -- Add the current alert timestamp to the group, then prune relative
        -- to this alert's own timestamp so event-time input works as well.
        stamps[#stamps + 1] = current_stmp
        cleanup_old_alerts(alert_group, current_stmp)
        recent_count = #alertCounts[alert_group]
    end

    if frequency_mode and recent_count < min_count then
        -- Suppression based on MIN_COUNT and COUNT_INTERVAL
        event.alert_suppressed = 1
        if log_suppressed then
            esp_logMessage(LOGGING_CONTEXT, "Alert ID=" .. tostring(data.alert_id) .. " suppressed for group:'" .. tostring(alert_group) .. "'; Reason: low alert frequency '" .. tostring(recent_count) .. "' where expected is '" .. tostring(SETTINGS["MIN_COUNT"]) .. "' for the COUNT_INTERVAL = " .. tostring(SETTINGS["COUNT_INTERVAL"]) .. " seconds", "info")
        end
    else
        local last_timestamp = lastAlertTimestamps[alert_group]
        if last_timestamp and ((current_stmp - last_timestamp) / MICROS_PER_SECOND) < suppression_period then
            -- Suppress the alert: one was already passed within SUPPRESSION_PERIOD
            event.alert_suppressed = 1
            if log_suppressed then
                -- os.date needs whole seconds; a fractional value raises an
                -- error on Lua 5.3+, hence the math.floor.
                local last_seconds = math.floor(last_timestamp / MICROS_PER_SECOND)
                esp_logMessage(LOGGING_CONTEXT, "Alert ID=" .. tostring(data.alert_id) .. " suppressed for group:'" .. tostring(alert_group) .. "'; Reason: already was sent alert at " .. os.date("%Y-%m-%d %H:%M:%S", last_seconds) .. " in the SUPPRESSION_PERIOD = " .. tostring(SETTINGS["SUPPRESSION_PERIOD"]) .. " seconds", "info")
            end
        else
            -- Update the timestamp for this group
            lastAlertTimestamps[alert_group] = current_stmp
            event.alert_suppressed = 0
        end
    end

    -- Output the event
    event.alert_id = data.alert_id
    event.alert_group = alert_group
    event.alert_stmp = current_stmp
    if SETTINGS["OUTPUT_SUPPRESSED_EVENTS"] == "1" or event.alert_suppressed == 0 then
        return event
    end
    return nil
end
-- Configuration table: window settings, input/output fields and the
-- initialization settings with their defaults.
-- NOTE(review): `_espconfig_` is assigned a second time further down in this
-- file with a nearly identical table, so that later assignment is the one
-- left in effect.
-- NOTE(review): USE_EXTERNAL_CACHE and alert_group_stmp are declared here but
-- no code in this file reads them.
_espconfig_ = {
    settings = { desc = "", expand_parms = false, process_blocks = false, encode_binary = false },

    inputVariables = {
        desc = "...",
        fields = {
            { name = "alert_id", desc = "unique alert key, will be a key in the output schema", optional = false },
            { name = "alert_group", desc = "if missing default group will be assigned and warning message will be sent to the log", optional = true },
            { name = "alert_stmp", desc = "can be set in SETTINGS[ALERT_TIME_FIELD], if field is missing system time will be used instead and info message will be sent to the log", optional = true },
            { name = "alert_group_stmp", desc = "only required when SETTINGS[USE_EXTERNAL_CACHE] = 1 ", optional = true },
        },
    },

    outputVariables = {
        desc = "...",
        fields = {
            { name = "alert_id", desc = "propagated from input" },
            { name = "alert_group", desc = "propagated from input" },
            { name = "alert_stmp", desc = "propagated from input" },
            { name = "alert_suppressed", desc = "suppression flag" },
        },
    },

    initialization = {
        desc = "...",
        fields = {
            { name = "ALERT_TIME_FIELD", desc = "The name of a time field (string) from the input event schema to use for calculating the suppression period. If not set or the field name does not exist in the event metadata, the system time will be used instead.", default = "alert_stmp" },
            { name = "SUPPRESSION_PERIOD", desc = "An integer number of seconds after which all alerts should be suppressed after the first. The period works for unique alert_group. When the period ends, the first new event passes, and the suppressed period is renewed.", default = "10" },
            { name = "USE_EXTERNAL_CACHE", desc = "Where to store the suppression period state. If set to 0, the state will be stored only in the Lua window. In this case, we do not support HA and autoscaling in K8ts for the project. If set to 1, the current state will come as the alert_group_stmp field from the input window, and the project will be stateless and autoscalable. The cache can be stored in any persistent storage, we recommend implementing the cache using Redis (see the project template)", default = "0" },
            { name = "OUTPUT_SUPPRESSED_EVENTS", desc = "Output suppressed events from the window (with alert_suppressed = 1 ) ", default = "1" },
            { name = "LOG_SUPPRESSED_EVENTS", desc = "Output every suppressed events to the pod log", default = "1" },
            { name = "MIN_COUNT", desc = "Allowed minimum number of alerts in a given alert_group in the last COUNT_INTERVAL seconds.A value of '0' or no setting means that this mode is disabled.", default = "2" },
            { name = "COUNT_INTERVAL", desc = "The number of seconds during which the number of alerts in the alert_group will be counted and compared to the MIN_COUNT. If it is less, the alert will be suppressed. A value of '0' or no setting means that this mode is disabled.", default = "10" },
        },
    },
}
--[[metadata start
{
"name": "Alert Suppression",
"description": "....",
"tags": [
"lua",
"test"
],
"versionNotes": "Added MIN_COUNT and COUNT_INTERVAL logic"
}
metadata end]]--
-- Configuration table: window settings, input/output fields and the
-- initialization settings with their defaults.
-- NOTE(review): an earlier assignment to `_espconfig_` exists above in this
-- file; this later one replaces it and is the value left in effect.
-- NOTE(review): USE_EXTERNAL_CACHE and alert_group_stmp are declared here but
-- no code in this file reads them.
_espconfig_ = {
    settings = { desc = "", expand_parms = false, process_blocks = false, encode_binary = false },

    inputVariables = {
        desc = "...",
        fields = {
            { name = "alert_id", desc = "unique alert key, will be a key in the output schema", optional = false },
            { name = "alert_group", desc = "if missing default group will be assigned and warning message will be sent to the log", optional = true },
            { name = "alert_stmp", desc = "can be set in SETTINGS[ALERT_TIME_FIELD], if field is missing system time will be used instead and info message will be sent to the log", optional = true },
            { name = "alert_group_stmp", desc = "only required when SETTINGS[USE_EXTERNAL_CACHE] = 1 ", optional = true },
        },
    },

    outputVariables = {
        desc = "...",
        fields = {
            { name = "alert_id", desc = "propagated from input" },
            { name = "alert_group", desc = "propagated from input" },
            { name = "alert_stmp", desc = "propagated from input" },
            { name = "alert_suppressed", desc = "suppression flag" },
        },
    },

    initialization = {
        desc = "...",
        fields = {
            { name = "ALERT_TIME_FIELD", desc = "The name of a time field (string) from the input event schema to use for calculating the suppression period. If not set or the field name does not exist in the event metadata, the system time will be used instead.", default = "alert_stmp" },
            { name = "SUPPRESSION_PERIOD", desc = "An integer number of seconds after which all alerts should be suppressed after the first. The period works for unique alert_group. When the period ends, the first new event passes, and the suppressed period is renewed.", default = "10" },
            { name = "USE_EXTERNAL_CACHE", desc = "Where to store the suppression period state. If set to 0, the state will be stored only in the Lua window. In this case, we do not support HA and autoscaling in K8ts for the project. If set to '1', the current state will come as the alert_group_stmp field from the input window, and the project will be stateless and autoscalable. The cache can be stored in any persistent storage, we recommend implementing the cache using Redis (see the project template)", default = "0" },
            { name = "OUTPUT_SUPPRESSED_EVENTS", desc = "Output suppressed events from the window (with alert_suppressed = 1 ) ", default = "1" },
            { name = "LOG_SUPPRESSED_EVENTS", desc = "Output every suppressed event to the pod log", default = "1" },
            { name = "MIN_COUNT", desc = "Allowed minimum number of alerts in a given alert_group in the last COUNT_INTERVAL seconds.A value of '0' or no setting means that this mode is disabled.", default = "2" },
            { name = "COUNT_INTERVAL", desc = "The number of seconds during which the number of alerts in the alert_group will be counted and compared to the MIN_COUNT. If it is less, the alert will be suppressed. A value of '0' or no setting means that this mode is disabled.", default = "10" },
        },
    },
}
--[[metadata start
{
"name": "Alert Suppression",
"description": "....",
"tags": [
"lua",
"test"
],
"versionNotes": "Added MIN_COUNT and COUNT_INTERVAL logic"
}
metadata end]]--