Skip to content

Commit eb23546

Browse files
committed
Support monitor period variables for all metrics
1 parent 41e82ab commit eb23546

4 files changed

Lines changed: 90 additions & 13 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
SHELL := /bin/bash
2-
TERRAFORM_VERSION ?= 1.0.1
2+
TERRAFORM_VERSION ?= 1.0.5
33

44
-include $(shell curl -sSL -o .build-harness "https://git.io/build-harness"; echo .build-harness)
55

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,17 @@ module "es_alarms" {
102102
| `monitor_kms` | Enable monitoring of KMS-related metrics, enable if using KMS | bool | `false` | no |
103103
| `monitor_master_cpu_utilization_too_high` | Enable monitoring of CPU utilization of master nodes are too high. Only enable this when dedicated master is enabled | bool | `false` | no |
104104
| `monitor_master_jvm_memory_pressure_too_high` | Enable monitoring of JVM memory pressure of master nodes are too high. Only enable this when dedicated master is enabled | bool | `false` | no |
105+
| `monitor_min_available_nodes_period` | The period, in seconds, over which the statistic is applied for the minimum available nodes alarm | string | `86400` | no |
106+
| `monitor_automated_snapshot_failure_period` | The period, in seconds, over which the statistic is applied for the automated snapshot failure alarm | string | `60` | no |
107+
| `monitor_cluster_index_writes_blocked_period` | The period, in seconds, over which the statistic is applied for the cluster index writes blocked alarm | string | `300` | no |
108+
| `monitor_cluster_status_is_red_period` | The period, in seconds, over which the statistic is applied for the cluster status red alarm | string | `60` | no |
109+
| `monitor_cluster_status_is_yellow_period` | The period, in seconds, over which the statistic is applied for the cluster status yellow alarm | string | `60` | no |
110+
| `monitor_cpu_utilization_too_high_period` | The period, in seconds, over which the statistic is applied for the CPU utilization too high alarm | string | `900` | no |
111+
| `monitor_free_storage_space_too_low_period` | The period, in seconds, over which the statistic is applied for the free storage space too low alarm | string | `60` | no |
112+
| `monitor_jvm_memory_pressure_too_high_period` | The period, in seconds, over which the statistic is applied for the JVM memory pressure too high alarm | string | `900` | no |
113+
| `monitor_kms_period` | The period, in seconds, over which the statistic is applied for the KMS-related alarms (KMS key error and KMS key inaccessible) | string | `60` | no |
114+
| `monitor_master_cpu_utilization_too_high_period` | The period, in seconds, over which the statistic is applied for the master node CPU utilization too high alarm | string | `900` | no |
115+
| `monitor_master_jvm_memory_pressure_too_high_period` | The period, in seconds, over which the statistic is applied for the master node JVM memory pressure too high alarm | string | `900` | no |
105116
| `create_sns_topic` | Will create an SNS topic, if you set this to false you MUST set `sns_topic` to a FULL ARN | bool | `true` | no |
106117
| `sns_topic` | SNS topic you want to specify. If leave empty, it will use a prefix and a timestamp appended. If `create_sns_topic` is set to false, this MUST be a FULL ARN | string | `""` | no |
107118
| `sns_topic_postfix` | SNS topic postfix | string | `""` | no |

alarms.tf

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_status_is_red" {
1616
evaluation_periods = "1"
1717
metric_name = "ClusterStatus.red"
1818
namespace = "AWS/ES"
19-
period = "60"
19+
period = var.monitor_cluster_status_is_red_period
2020
statistic = "Maximum"
2121
threshold = "1"
2222
alarm_description = "Average elasticsearch cluster status is in red over last 1 minute"
@@ -38,7 +38,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_status_is_yellow" {
3838
evaluation_periods = var.alarm_cluster_status_is_yellow_periods
3939
metric_name = "ClusterStatus.yellow"
4040
namespace = "AWS/ES"
41-
period = "60"
41+
period = var.monitor_cluster_status_is_yellow_period
4242
statistic = "Maximum"
4343
threshold = "1"
4444
alarm_description = "Average elasticsearch cluster status is in yellow over last ${var.alarm_cluster_status_is_yellow_periods} minute(s)"
@@ -60,7 +60,7 @@ resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
6060
evaluation_periods = var.alarm_free_storage_space_too_low_periods
6161
metric_name = "FreeStorageSpace"
6262
namespace = "AWS/ES"
63-
period = "60"
63+
period = var.monitor_free_storage_space_too_low_period
6464
statistic = "Minimum"
6565
threshold = local.thresholds["FreeStorageSpaceThreshold"]
6666
alarm_description = "Average elasticsearch free storage space over last ${var.alarm_free_storage_space_too_low_periods} minute(s) is too low"
@@ -82,7 +82,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_index_writes_blocked" {
8282
evaluation_periods = "1"
8383
metric_name = "ClusterIndexWritesBlocked"
8484
namespace = "AWS/ES"
85-
period = "300"
85+
period = var.monitor_cluster_index_writes_blocked_period
8686
statistic = "Maximum"
8787
threshold = "1"
8888
alarm_description = "Elasticsearch index writes being blocker over last 5 minutes"
@@ -104,7 +104,7 @@ resource "aws_cloudwatch_metric_alarm" "insufficient_available_nodes" {
104104
evaluation_periods = "1"
105105
metric_name = "Nodes"
106106
namespace = "AWS/ES"
107-
period = "86400"
107+
period = var.monitor_min_available_nodes_period
108108
statistic = "Minimum"
109109
threshold = local.thresholds["MinimumAvailableNodes"]
110110
alarm_description = "Elasticsearch nodes minimum < ${local.thresholds["MinimumAvailableNodes"]} for 1 day"
@@ -126,7 +126,7 @@ resource "aws_cloudwatch_metric_alarm" "automated_snapshot_failure" {
126126
evaluation_periods = "1"
127127
metric_name = "AutomatedSnapshotFailure"
128128
namespace = "AWS/ES"
129-
period = "60"
129+
period = var.monitor_automated_snapshot_failure_period
130130
statistic = "Maximum"
131131
threshold = "1"
132132
alarm_description = "Elasticsearch automated snapshot failed over last 1 minute"
@@ -148,7 +148,7 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
148148
evaluation_periods = "3"
149149
metric_name = "CPUUtilization"
150150
namespace = "AWS/ES"
151-
period = "900"
151+
period = var.monitor_cpu_utilization_too_high_period
152152
statistic = "Average"
153153
threshold = local.thresholds["CPUUtilizationThreshold"]
154154
alarm_description = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
@@ -169,7 +169,7 @@ resource "aws_cloudwatch_metric_alarm" "jvm_memory_pressure_too_high" {
169169
evaluation_periods = "1"
170170
metric_name = "JVMMemoryPressure"
171171
namespace = "AWS/ES"
172-
period = "900"
172+
period = var.monitor_jvm_memory_pressure_too_high_period
173173
statistic = "Maximum"
174174
threshold = local.thresholds["JVMMemoryPressureThreshold"]
175175
alarm_description = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
@@ -190,7 +190,7 @@ resource "aws_cloudwatch_metric_alarm" "master_cpu_utilization_too_high" {
190190
evaluation_periods = "3"
191191
metric_name = "MasterCPUUtilization"
192192
namespace = "AWS/ES"
193-
period = "900"
193+
period = var.monitor_master_cpu_utilization_too_high_period
194194
statistic = "Average"
195195
threshold = local.thresholds["MasterCPUUtilizationThreshold"]
196196
alarm_description = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
@@ -211,7 +211,7 @@ resource "aws_cloudwatch_metric_alarm" "master_jvm_memory_pressure_too_high" {
211211
evaluation_periods = "1"
212212
metric_name = "MasterJVMMemoryPressure"
213213
namespace = "AWS/ES"
214-
period = "900"
214+
period = var.monitor_master_jvm_memory_pressure_too_high_period
215215
statistic = "Maximum"
216216
threshold = local.thresholds["MasterJVMMemoryPressureThreshold"]
217217
alarm_description = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
@@ -232,7 +232,7 @@ resource "aws_cloudwatch_metric_alarm" "kms_key_error" {
232232
evaluation_periods = "1"
233233
metric_name = "KMSKeyError"
234234
namespace = "AWS/ES"
235-
period = "60"
235+
period = var.monitor_kms_period
236236
statistic = "Maximum"
237237
threshold = "1"
238238
alarm_description = "Elasticsearch KMS Key Error failed over last 1 minute"
@@ -254,7 +254,7 @@ resource "aws_cloudwatch_metric_alarm" "kms_key_inaccessible" {
254254
evaluation_periods = "1"
255255
metric_name = "KMSKeyInaccessible"
256256
namespace = "AWS/ES"
257-
period = "60"
257+
period = var.monitor_kms_period
258258
statistic = "Maximum"
259259
threshold = "1"
260260
alarm_description = "Elasticsearch KMS Key Inaccessible failed over last 1 minute"

variables.tf

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,72 @@ variable "monitor_master_jvm_memory_pressure_too_high" {
9999
default = false
100100
}
101101

102+
variable "monitor_min_available_nodes_period" {
103+
description = "The period of the minimum available nodes should the statistics be applied in seconds"
104+
type = string
105+
default = "86400"
106+
}
107+
108+
variable "monitor_cluster_status_is_red_period" {
109+
description = "The period of the cluster status is in red should the statistics be applied in seconds"
110+
type = string
111+
default = "60"
112+
}
113+
114+
variable "monitor_cluster_status_is_yellow_period" {
115+
description = "The period of the cluster status is in yellow should the statistics be applied in seconds"
116+
type = string
117+
default = "60"
118+
}
119+
120+
variable "monitor_free_storage_space_too_low_period" {
121+
description = "The period of the cluster average free storage is too low should the statistics be applied in seconds"
122+
type = string
123+
default = "60"
124+
}
125+
126+
variable "monitor_cluster_index_writes_blocked_period" {
127+
description = "The period of the cluster index writes being blocked should the statistics be applied in seconds"
128+
type = string
129+
default = "300"
130+
}
131+
132+
variable "monitor_automated_snapshot_failure_period" {
133+
description = "The period of the automated snapshot failure should the statistics be applied in seconds"
134+
type = string
135+
default = "60"
136+
}
137+
138+
variable "monitor_cpu_utilization_too_high_period" {
139+
description = "The period of the CPU utilization is too high should the statistics be applied in seconds"
140+
type = string
141+
default = "900"
142+
}
143+
144+
variable "monitor_jvm_memory_pressure_too_high_period" {
145+
description = "The period of the JVM memory pressure is too high should the statistics be applied in seconds"
146+
type = string
147+
default = "900"
148+
}
149+
150+
variable "monitor_kms_period" {
151+
description = "The period of the KMS-related metrics should the statistics be applied in seconds"
152+
type = string
153+
default = "60"
154+
}
155+
156+
variable "monitor_master_cpu_utilization_too_high_period" {
157+
description = "The period of the CPU utilization of master nodes are too high should the statistics be applied in seconds"
158+
type = string
159+
default = "900"
160+
}
161+
162+
variable "monitor_master_jvm_memory_pressure_too_high_period" {
163+
description = "The period of the JVM memory pressure of master nodes are too high should the statistics be applied in seconds"
164+
type = string
165+
default = "900"
166+
}
167+
102168
variable "free_storage_space_threshold" {
103169
description = "The minimum amount of available storage space in MegaByte."
104170
type = number

0 commit comments

Comments
 (0)