dubiety
diff --git a/‎.travis.yml‎
Lines changed: 16 additions & 0 deletions b/‎.travis.yml‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 8 additions & 0 deletions b/‎Makefile‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 124 additions & 0 deletions b/‎README.md‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎alarms.tf‎
Lines changed: 210 additions & 0 deletions b/‎alarms.tf‎
Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,16 @@
+# Packages installed through apt before the build starts.
+addons:
+  apt:
+    packages: [git, make, curl]
+
+# Install phase: runs the Makefile's `init` target.
+install:
+  - make init
+
+# Build phase: one make invocation per terraform step, in this order.
+script:
+  - make terraform/install
+  - make terraform/get-plugins
+  - make terraform/get-modules
+  - make terraform/lint
+  - make terraform/validate
@@ -0,0 +1,8 @@
+# Run every recipe under bash.
+SHELL := /bin/bash
+# `?=` keeps any value already supplied by the environment or the command line.
+# Not referenced in this file; presumably read by the included build-harness targets.
+TERRAFORM_VERSION ?= 0.11.11
+
+# While the Makefile is parsed, download the build-harness bootstrap into
+# .build-harness and include it. The leading `-` stops make from aborting when
+# the file is missing (for example, if the download did not succeed).
+-include $(shell curl -sSL -o .build-harness "https://git.io/build-harness"; echo .build-harness)
+
+# `lint` is a command, not a file: declare it phony so a file or directory named
+# `lint` can never make the target look up to date.
+.PHONY: lint
+
+## Lint terraform code
+# SELF is not defined in this file; presumably provided by the included build-harness.
+lint:
+	$(SELF) terraform/install terraform/get-modules terraform/get-plugins terraform/lint terraform/validate
@@ -0,0 +1,124 @@
+# terraform-aws-elasticsearch-cloudwatch-sns-alarms
+
+[![Build Status](https://travis-ci.org/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms.svg?branch=master)](https://travis-ci.org/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms)
+[![Latest Release](https://img.shields.io/github/release/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms.svg)](https://github.com/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms/releases)
+
+Terraform module that configures important elasticsearch alerts using CloudWatch and sends them to an SNS topic.
+
+Create a set of sane Elasticsearch CloudWatch alerts for monitoring the health of an elasticsearch cluster.
+
+This project is inspired by [CloudPosse](https://github.com/cloudposse)
+
+It's 100% Open Source and licensed under the [APACHE2](LICENSE).
+
+## Usage
+
+| area       | metric                    | comparison operator | threshold | rationale                                                                                                                                                                                                                                   |
+|------------|---------------------------|---------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Sharding   | ClusterStatus.red         | `>=`                | 1         | At least one primary shard and its replicas are not allocated to a node for 1 minute 1 consecutive time. See [Red Cluster Status](https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-handling-errors.html#aes-handling-errors-red-cluster-status). |
+| Sharding   | ClusterStatus.yellow      | `>=`                | 1         | At least one replica shard is not allocated to a node for 1 minute 1 consecutive time. See [Yellow Cluster Status](https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-handling-errors.html#aes-handling-errors-yellow-cluster-status).             |
+| Storage    | FreeStorageSpace          | `<=`                | 20480 MB  | A node in your cluster is down to 20 GiB of free storage space for 1 minute 1 consecutive time. See Lack of Available Storage Space. This value is in MiB, so rather than 20480, we recommend setting it to 25% of the storage space for each node.                         |
+| Storage    | ClusterIndexWritesBlocked | `>=`                | 1         | The cluster is blocking write requests for 5 minutes 1 consecutive time. See [ClusterBlockException](https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-handling-errors.html#troubleshooting-cluster-block).                                       |
+| Node Count | Nodes                     | `<`                 | `x`       | `x` is the number of nodes in your cluster. This alarm indicates that at least one node in your cluster has been unreachable for one day.                                                                                                                                   |
+| Snapshot   | AutomatedSnapshotFailure  | `>=`                | 1         | An automated snapshot failed for 1 minute 1 consecutive time. This failure is often the result of a red cluster health status. See [Red Cluster Status](https://docs.aws.amazon.com/elasticsearch-service/latest/developerguide/aes-handling-errors.html#aes-handling-errors-red-cluster-status). |
+| CPU        | CPUUtilization            | `>=`                | 80 %      | CPU utilization average is >= 80% for 15 minutes, 3 consecutive times for the node cluster.                                                                                                                                                                                 |
+| Memory     | JVMMemoryPressure         | `>=`                | 80 %      | JVMMemoryPressure maximum is >= 80% for 15 minutes, 1 consecutive time.                                                                                                                                                                                                     |
+| CPU        | MasterCPUUtilization      | `>=`                | 80 %      | Dedicated master nodes' CPU utilization is >= 80% for 15 minutes, 3 consecutive times.                                                                                                                                                                                      |
+| Memory     | MasterJVMMemoryPressure   | `>=`                | 80 %      | Dedicated master nodes' maximum JVM memory usage is >= 80% for 15 minutes, 1 consecutive time.                                                                                                                                                                              |
+
+## Examples
+
+See the [`examples/`](examples/) directory for working examples.
+
+```hcl
+resource "aws_elasticsearch_domain" "es" {
+  domain_name           = "example"
+  elasticsearch_version = "6.3"
+
+  cluster_config {
+    instance_type = "r4.large.elasticsearch"
+  }
+
+  snapshot_options {
+    automated_snapshot_start_hour = 23
+  }
+
+  tags = {
+    Domain = "TestDomain"
+  }
+}
+
+module "es_alarms" {
+  source         = "git::https://github.com/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms.git?ref=master"
+  domain_name    = "example"
+}
+```
+
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|:----:|:-----:|:-----:|
+| alarm_name_postfix | Alarm name postfix | string | `""` | no |
+| alarm_name_prefix | Alarm name prefix | string | `""` | no |
+| cpu_utilization_threshold | The maximum percentage of CPU utilization | string | `80` | no |
+| domain_name | The Elasticsearch domain name you want to monitor. | string | - | yes |
+| free_storage_space_threshold | The minimum amount of available storage space in Byte. | string | `21474836480` | no |
+| jvm_memory_pressure_threshold | The maximum percentage of the Java heap used for all data nodes in the cluster | string | `80` | no |
+| master_cpu_utilization_threshold | The maximum percentage of CPU utilization of master nodes | string | `""` | no |
+| master_jvm_memory_pressure_threshold | The maximum percentage of the Java heap used for master nodes in the cluster | string | `""` | no |
+| min_available_nodes | The minimum available (reachable) nodes to have | string | `1` | no |
+| monitor_automated_snapshot_failure | Enable monitoring of automated snapshot failure | string | `true` | no |
+| monitor_cluster_index_writes_blocked | Enable monitoring of cluster index writes being blocked | string | `true` | no |
+| monitor_cluster_status_is_red | Enable monitoring of cluster status is in red | string | `true` | no |
+| monitor_cluster_status_is_yellow | Enable monitoring of cluster status is in yellow | string | `true` | no |
+| monitor_cpu_utilization_too_high | Enable monitoring of CPU utilization is too high | string | `true` | no |
+| monitor_free_storage_space_too_low | Enable monitoring of cluster average free storage is too low | string | `true` | no |
+| monitor_insufficient_available_nodes | Enable monitoring insufficient available nodes | string | `false` | no |
+| monitor_jvm_memory_pressure_too_high | Enable monitoring of JVM memory pressure is too high | string | `true` | no |
+| monitor_master_cpu_utilization_too_high | Enable monitoring of CPU utilization of master nodes are too high. Only enable this when dedicated master is enabled | string | `false` | no |
+| monitor_master_jvm_memory_pressure_too_high | Enable monitoring of JVM memory pressure of master nodes are too high. Only enable this when dedicated master is enabled | string | `false` | no |
+| sns_topic | SNS topic you want to specify. If left empty, it will use a prefix with a timestamp appended | string | `""` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| sns_topic_arn | The ARN of the SNS topic |
+
+## Share the Love
+
+Like this project? Please give it a ★ on [our GitHub](https://github.com/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms)!
+
+## Help
+
+**Got a question?**
+
+File a GitHub [issue](https://github.com/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms/issues).
+
+### Bug Reports & Feature Requests
+
+Please use the [issue tracker](https://github.com/dubiety/terraform-aws-elasticsearch-cloudwatch-sns-alarms/issues) to report any bugs or file feature requests.
+
+## License
+
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+
+See [LICENSE](LICENSE) for full details.
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
@@ -0,0 +1,210 @@
+# Alarm thresholds computed from the input variables. Each value is bounded
+# with min/max so an out-of-range input cannot produce a nonsensical alarm.
+locals {
+  thresholds = {
+    # Percentages for the data nodes: clamped to the range 0..100.
+    CPUUtilizationThreshold    = "${min(max(var.cpu_utilization_threshold, 0), 100)}"
+    JVMMemoryPressureThreshold = "${min(max(var.jvm_memory_pressure_threshold, 0), 100)}"
+
+    # Percentages for the master nodes: coalesce() picks the master-specific
+    # input when it is non-empty, otherwise the data-node input; then clamped.
+    MasterCPUUtilizationThreshold    = "${min(max(coalesce(var.master_cpu_utilization_threshold, var.cpu_utilization_threshold), 0), 100)}"
+    MasterJVMMemoryPressureThreshold = "${min(max(coalesce(var.master_jvm_memory_pressure_threshold, var.jvm_memory_pressure_threshold), 0), 100)}"
+
+    # Absolute quantities: only a lower bound of 0 is applied.
+    FreeStorageSpaceThreshold = "${max(var.free_storage_space_threshold, 0)}"
+    MinimumAvailableNodes     = "${max(var.min_available_nodes, 0)}"
+  }
+}
+
+# Alarm on the AWS/ES "ClusterStatus.red" metric of the monitored domain.
+# Fires when the average over a single 60-second period is >= 1.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_cluster_status_is_red input.
+resource "aws_cloudwatch_metric_alarm" "cluster_status_is_red" {
+  count               = "${var.monitor_cluster_status_is_red}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-ClusterStatusIsRed${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "ClusterStatus.red"
+  namespace           = "AWS/ES"
+  period              = "60"
+  statistic           = "Average"
+  threshold           = "1"
+  # Window is evaluation_periods x period = 1 x 60 s, so the text says 1 minute.
+  alarm_description   = "Average elasticsearch cluster status is in red over last 1 minute"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Alarm on the AWS/ES "ClusterStatus.yellow" metric of the monitored domain.
+# Fires when the average over a single 60-second period is >= 1.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_cluster_status_is_yellow input.
+resource "aws_cloudwatch_metric_alarm" "cluster_status_is_yellow" {
+  count               = "${var.monitor_cluster_status_is_yellow}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-ClusterStatusIsYellow${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "ClusterStatus.yellow"
+  namespace           = "AWS/ES"
+  period              = "60"
+  statistic           = "Average"
+  threshold           = "1"
+  # Window is evaluation_periods x period = 1 x 60 s, so the text says 1 minute.
+  alarm_description   = "Average elasticsearch cluster status is in yellow over last 1 minute"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Low-disk alarm: fires when the 60-second average of the AWS/ES
+# "FreeStorageSpace" metric is <= the configured threshold in one period.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# NOTE(review): the README describes this metric's value as MiB while the
+# threshold input is documented in bytes - confirm which unit
+# free_storage_space_threshold is expected to carry.
+resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
+  count             = "${var.monitor_free_storage_space_too_low}"
+  alarm_name        = "${var.alarm_name_prefix}ElasticSearch-FreeStorageSpaceTooLow${var.alarm_name_postfix}"
+  alarm_description = "Average elasticsearch free storage space over last 1 minutes is too low"
+
+  # Which metric to watch.
+  namespace   = "AWS/ES"
+  metric_name = "FreeStorageSpace"
+
+  dimensions {
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+    DomainName = "${var.domain_name}"
+  }
+
+  # How the metric is evaluated.
+  statistic           = "Average"
+  period              = "60"
+  evaluation_periods  = "1"
+  comparison_operator = "LessThanOrEqualToThreshold"
+  threshold           = "${local.thresholds["FreeStorageSpaceThreshold"]}"
+
+  # Where state changes are published.
+  alarm_actions = ["${local.aws_sns_topic_arn}"]
+  ok_actions    = ["${local.aws_sns_topic_arn}"]
+}
+
+# Alarm on the AWS/ES "ClusterIndexWritesBlocked" metric of the monitored domain.
+# Fires when the average over a single 300-second period is >= 1.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_cluster_index_writes_blocked input.
+resource "aws_cloudwatch_metric_alarm" "cluster_index_writes_blocked" {
+  count               = "${var.monitor_cluster_index_writes_blocked}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-ClusterIndexWritesBlocked${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "ClusterIndexWritesBlocked"
+  namespace           = "AWS/ES"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "1"
+  # Window is evaluation_periods x period = 1 x 300 s, so the text says 5 minutes.
+  alarm_description   = "Elasticsearch index writes being blocked over last 5 minutes"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Alarm on the AWS/ES "Nodes" metric of the monitored domain.
+# Fires when the minimum node count over a single 86400-second (1 day) period
+# drops below the configured minimum.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_insufficient_available_nodes input.
+resource "aws_cloudwatch_metric_alarm" "insufficient_available_nodes" {
+  count               = "${var.monitor_insufficient_available_nodes}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-InsufficientAvailableNodes${var.alarm_name_postfix}"
+  # Strictly less-than: having exactly the configured minimum is healthy. This
+  # matches the "<" in the alarm description below and in the README table.
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "Nodes"
+  namespace           = "AWS/ES"
+  period              = "86400"
+  statistic           = "Minimum"
+  threshold           = "${local.thresholds["MinimumAvailableNodes"]}"
+  alarm_description   = "Elasticsearch nodes minimum < ${local.thresholds["MinimumAvailableNodes"]} for 1 day"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Snapshot alarm: fires when the maximum of the AWS/ES
+# "AutomatedSnapshotFailure" metric over one 600-second period is >= 1.
+# Both ALARM and OK transitions are sent to the SNS topic.
+resource "aws_cloudwatch_metric_alarm" "automated_snapshot_failure" {
+  count             = "${var.monitor_automated_snapshot_failure}"
+  alarm_name        = "${var.alarm_name_prefix}ElasticSearch-AutomatedSnapshotFailure${var.alarm_name_postfix}"
+  alarm_description = "Elasticsearch automated snapshot failed over last 10 minutes"
+
+  # Which metric to watch.
+  namespace   = "AWS/ES"
+  metric_name = "AutomatedSnapshotFailure"
+
+  dimensions {
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+    DomainName = "${var.domain_name}"
+  }
+
+  # How the metric is evaluated.
+  statistic           = "Maximum"
+  period              = "600"
+  evaluation_periods  = "1"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = "1"
+
+  # Where state changes are published.
+  alarm_actions = ["${local.aws_sns_topic_arn}"]
+  ok_actions    = ["${local.aws_sns_topic_arn}"]
+}
+
+# Alarm on the AWS/ES "CPUUtilization" metric of the monitored domain.
+# Fires when the 900-second average is >= the configured threshold for
+# 3 consecutive periods.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_cpu_utilization_too_high input.
+resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
+  count               = "${var.monitor_cpu_utilization_too_high}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-CPUUtilizationTooHigh${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "3"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ES"
+  period              = "900"
+  statistic           = "Average"
+  threshold           = "${local.thresholds["CPUUtilizationThreshold"]}"
+  # Window is evaluation_periods x period = 3 x 900 s, so the text says 45 minutes.
+  alarm_description   = "Average elasticsearch cluster CPU utilization over last 45 minutes too high"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Alarm on the AWS/ES "JVMMemoryPressure" metric of the monitored domain.
+# Fires when the maximum over a single 900-second period is >= the configured
+# threshold.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_jvm_memory_pressure_too_high input.
+resource "aws_cloudwatch_metric_alarm" "jvm_memory_pressure_too_high" {
+  count               = "${var.monitor_jvm_memory_pressure_too_high}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-JVMMemoryPressure${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "JVMMemoryPressure"
+  namespace           = "AWS/ES"
+  period              = "900"
+  statistic           = "Maximum"
+  threshold           = "${local.thresholds["JVMMemoryPressureThreshold"]}"
+  # Window is evaluation_periods x period = 1 x 900 s, so the text says 15 minutes.
+  alarm_description   = "Elasticsearch JVM memory pressure is too high over last 15 minutes"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Alarm on the AWS/ES "MasterCPUUtilization" metric of the monitored domain.
+# Fires when the 900-second average is >= the configured threshold for
+# 3 consecutive periods.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_master_cpu_utilization_too_high input.
+resource "aws_cloudwatch_metric_alarm" "master_cpu_utilization_too_high" {
+  count               = "${var.monitor_master_cpu_utilization_too_high}"
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-MasterCPUUtilizationTooHigh${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "3"
+  metric_name         = "MasterCPUUtilization"
+  namespace           = "AWS/ES"
+  period              = "900"
+  statistic           = "Average"
+  threshold           = "${local.thresholds["MasterCPUUtilizationThreshold"]}"
+  # The text names the master nodes so this notification can be told apart from
+  # the data-node CPU alarm; window is 3 x 900 s = 45 minutes.
+  alarm_description   = "Average elasticsearch master node CPU utilization over last 45 minutes too high"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}
+
+# Alarm on the AWS/ES "MasterJVMMemoryPressure" metric of the monitored domain.
+# Fires when the maximum over a single 900-second period is >= the configured
+# threshold.
+# Both ALARM and OK transitions are sent to the SNS topic.
+# count is taken directly from the monitor_master_jvm_memory_pressure_too_high input.
+resource "aws_cloudwatch_metric_alarm" "master_jvm_memory_pressure_too_high" {
+  count               = "${var.monitor_master_jvm_memory_pressure_too_high}"
+  # The name carries "Master" so it does not collide with the data-node
+  # "ElasticSearch-JVMMemoryPressure" alarm when both are enabled.
+  alarm_name          = "${var.alarm_name_prefix}ElasticSearch-MasterJVMMemoryPressure${var.alarm_name_postfix}"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "MasterJVMMemoryPressure"
+  namespace           = "AWS/ES"
+  period              = "900"
+  statistic           = "Maximum"
+  threshold           = "${local.thresholds["MasterJVMMemoryPressureThreshold"]}"
+  # Names the master nodes; window is 1 x 900 s = 15 minutes.
+  alarm_description   = "Elasticsearch master node JVM memory pressure is too high over last 15 minutes"
+  alarm_actions       = ["${local.aws_sns_topic_arn}"]
+  ok_actions          = ["${local.aws_sns_topic_arn}"]
+
+  # Select the metric by domain name and owning AWS account id.
+  dimensions {
+    DomainName = "${var.domain_name}"
+    ClientId   = "${data.aws_caller_identity.default.account_id}"
+  }
+}