Skip to content

Commit 62ae71e

Browse files
alvarocabanas and DavSanchez
authored and committed
ci(canaries): add alerting to provisioned k8s clusters
This adds a new Terraform module to the canaries infra that, using the New Relic provider, creates alerts for different scenarios. Currently there are basic alerts in place for general resource usage, but more could be added (and their thresholds updated) over time.
1 parent 97e3d47 commit 62ae71e

File tree

9 files changed

+269
-12
lines changed

9 files changed

+269
-12
lines changed

test/k8s-canaries/Makefile

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,35 @@ ifndef CANARY_DIR
1717
@echo "CANARY_DIR variable must be provided to know which canary to terraform-plan"
1818
exit 1
1919
endif
20-
terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) init && \
21-
terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) plan
20+
ifndef NEW_RELIC_ACCOUNT_ID
21+
@echo "NEW_RELIC_ACCOUNT_ID variable must be provided for test/k8s-canaries/terraform-plan"
22+
exit 1
23+
endif
24+
ifndef NEW_RELIC_API_KEY
25+
@echo "NEW_RELIC_API_KEY variable must be provided for test/k8s-canaries/terraform-plan"
26+
exit 1
27+
endif
28+
@terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) init && \
29+
terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) plan \
30+
-var="api_key=$(NEW_RELIC_API_KEY)" -var="account_id=$(NEW_RELIC_ACCOUNT_ID)"
2231

2332
.PHONY: test/k8s-canaries/terraform-apply
2433
test/k8s-canaries/terraform-apply:
2534
ifndef CANARY_DIR
2635
@echo "CANARY_DIR variable must be provided to know which canary to terraform-apply"
2736
exit 1
2837
endif
29-
terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) init && \
30-
terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) apply -auto-approve
31-
38+
ifndef NEW_RELIC_ACCOUNT_ID
39+
@echo "NEW_RELIC_ACCOUNT_ID variable must be provided for test/k8s-canaries/terraform-apply"
40+
exit 1
41+
endif
42+
ifndef NEW_RELIC_API_KEY
43+
@echo "NEW_RELIC_API_KEY variable must be provided for test/k8s-canaries/terraform-apply"
44+
exit 1
45+
endif
46+
@terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) init && \
47+
terraform -chdir=$(TERRAFORM_DIR)/$(CANARY_DIR) apply -auto-approve \
48+
-var="api_key=$(NEW_RELIC_API_KEY)" -var="account_id=$(NEW_RELIC_ACCOUNT_ID)"
3249

3350
.PHONY: test/k8s-canaries/update-kubeconfig-from-aws
3451
test/k8s-canaries/update-kubeconfig-from-aws:
@@ -51,12 +68,30 @@ endif
5168
ifndef IMAGE_TAG
5269
@echo "IMAGE_TAG variable must be provided for test/k8s-canaries/helm-upgrade"
5370
exit 1
71+
endif
72+
ifndef NR_SYSTEM_IDENTITY_CLIENT_ID
73+
@echo "NR_SYSTEM_IDENTITY_CLIENT_ID variable must be provided for test/k8s-canaries/helm-upgrade"
74+
exit 1
75+
endif
76+
ifndef NR_SYSTEM_IDENTITY_PRIVATE_KEY
77+
@echo "NR_SYSTEM_IDENTITY_PRIVATE_KEY variable must be provided for test/k8s-canaries/helm-upgrade"
78+
exit 1
5479
endif
5580
@helm repo add newrelic https://helm-charts.newrelic.com
81+
@kubectl create namespace newrelic --dry-run=client -o yaml | kubectl apply -f -
82+
@echo -n "$$NR_SYSTEM_IDENTITY_PRIVATE_KEY" > /tmp/private_key
83+
@kubectl get secret sys-identity --namespace newrelic || \
84+
kubectl create secret generic sys-identity \
85+
--namespace newrelic \
86+
--from-literal=CLIENT_ID=$$NR_SYSTEM_IDENTITY_CLIENT_ID \
87+
--from-file=private_key=/tmp/private_key
5688
@helm upgrade --install ac newrelic/agent-control --devel -f $(HELM_DIR)/agent-control.yml \
57-
-n newrelic --create-namespace \
89+
--namespace newrelic \
5890
--set global.licenseKey=$(NR_LICENSE_KEY) \
5991
--set global.cluster=$(CLUSTER_NAME) \
92+
--set global.nrStaging=true \
6093
--set agent-control-deployment.image.tag=$(IMAGE_TAG) \
61-
--set agent-control-deployment.podAnnotations.deploymentKey="${DEPLOYMENT_KEY}"
62-
94+
--set agent-control-deployment.config.fleet_control.auth.secret.create=false \
95+
--set agent-control-deployment.config.fleet_control.auth.secret.name="sys-identity" \
96+
--set agent-control-deployment.config.fleet_control.fleet_id="MTIyMTMwNjh8TkdFUHxGTEVFVHwwMTk0ZTAwMy03NGEyLTdiZTEtODk5NS0zZjgyN2E0MjBlMjA" \
97+
--set agent-control-deployment.podAnnotations.deploymentKey="$(DEPLOYMENT_KEY)"

test/k8s-canaries/helm/agent-control.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@ agent-control-deployment:
33
image:
44
imagePullPolicy: Always
55
config:
6-
fleet_control:
7-
enabled: false
86
agentControl:
97
content:
108
log:
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
resource "newrelic_alert_policy" "alert_k8s_canary" {
2+
name = format("%s: %s", var.region, var.cluster_name)
3+
}
4+
5+
locals {
6+
policies_with_cluster_name = [
7+
for pol in var.conditions : {
8+
policy_id = newrelic_alert_policy.alert_k8s_canary.id
9+
cluster_name = var.cluster_name
10+
condition = pol
11+
}
12+
]
13+
}
14+
15+
# Uncomment this to "debug" the generated structure
16+
#output "debug" {
17+
# value = local.policies_with_cluster_name
18+
#}
19+
20+
resource "newrelic_nrql_alert_condition" "condition_nrql_canary" {
21+
count = length(local.policies_with_cluster_name)
22+
23+
account_id = var.account_id
24+
policy_id = local.policies_with_cluster_name[count.index].policy_id
25+
name = local.policies_with_cluster_name[count.index].condition.name
26+
violation_time_limit_seconds = 3600
27+
28+
nrql {
29+
query = templatefile(
30+
local.policies_with_cluster_name[count.index].condition.template_name,
31+
merge(
32+
{
33+
"cluster_name" : "${local.policies_with_cluster_name[count.index].cluster_name}",
34+
"function" : null,
35+
"wheres" : {}
36+
},
37+
local.policies_with_cluster_name[count.index].condition
38+
)
39+
)
40+
}
41+
42+
critical {
43+
operator = local.policies_with_cluster_name[count.index].condition.operator
44+
threshold = local.policies_with_cluster_name[count.index].condition.threshold
45+
threshold_duration = local.policies_with_cluster_name[count.index].condition.duration
46+
threshold_occurrences = "ALL"
47+
}
48+
}
49+
50+
output "queries" {
51+
value = [newrelic_nrql_alert_condition.condition_nrql_canary.*.nrql]
52+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Configure terraform.
2+
terraform {
3+
required_version = "~> 1.0"
4+
required_providers {
5+
newrelic = {
6+
source = "newrelic/newrelic"
7+
}
8+
}
9+
}
10+
11+
# Configure the New Relic provider.
12+
provider "newrelic" {
13+
account_id = var.account_id
14+
api_key = var.api_key
15+
region = var.region
16+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# NR Account ID
2+
variable "account_id" {
3+
default = ""
4+
}
5+
6+
# NR User Api Key
7+
variable "api_key" {
8+
default = ""
9+
}
10+
11+
# US/EU/Staging
12+
variable "region" {
13+
default = "US"
14+
15+
validation {
16+
condition = can(regex("^(US|EU|Staging)$", var.region))
17+
error_message = "Unsupported region"
18+
}
19+
}
20+
21+
variable "cluster_name" {
22+
default = "Agent_Control_Canaries_Staging-Cluster"
23+
}
24+
25+
variable "policies_prefix" {
26+
default = "[Staging] Agent Control canaries metric monitoring"
27+
}
28+
29+
# conditions should follow next structure:
30+
#[
31+
# {
32+
# name = "System / Core Count"
33+
# metric = "coreCount"
34+
# sample = "SystemSample"
35+
# threshold = 0
36+
# duration = 600
37+
# operator = "above"
38+
# template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
39+
# },
40+
# {
41+
# name = "System / Cpu IOWait Percent"
42+
# metric = "cpuIOWaitPercent"
43+
# sample = "SystemSample"
44+
# threshold = 0.5 # max 0.112 in last week
45+
# duration = 600
46+
# operator = "above"
47+
# template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
48+
# },
49+
# ...
50+
# ]
51+
#
52+
variable "conditions" {
53+
default = []
54+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
SELECT max(${metric})
2+
FROM ${sample}
3+
WHERE (
4+
clusterName = '${cluster_name}'
5+
AND deploymentName = 'ac-agent-control'
6+
%{ for k, v in wheres }
7+
AND ${k}='${v}'
8+
%{ endfor }
9+
)
Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,49 @@
1-
# Use the EKS cluster module
1+
# Use the EKS cluster module
22
module "eks_cluster" {
33
source = "../modules/eks_cluster"
44
canary_name = "Agent_Control_Canaries_Production"
55
cluster_desired_size = 2
66
cluster_max_size = 3
77
cluster_min_size = 2
88
}
9+
10+
variable "account_id" {}
11+
variable "api_key" {}
12+
module "alerts" {
13+
source = "../modules/nr_alerts"
14+
15+
api_key = var.api_key
16+
account_id = var.account_id
17+
policies_prefix = "Agent Control canaries metric monitoring"
18+
conditions = [
19+
{
20+
name = "CPU usage (cores)"
21+
metric = "cpuUsedCores"
22+
sample = "K8sContainerSample"
23+
threshold = 1
24+
duration = 3600
25+
operator = "above"
26+
template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
27+
},
28+
{
29+
name = "Memory usage (bytes)"
30+
metric = "memoryWorkingSetBytes"
31+
sample = "K8sContainerSample"
32+
threshold = 10000000 # 10 MB
33+
duration = 600
34+
operator = "above"
35+
template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
36+
},
37+
{
38+
name = "Storage usage (bytes)"
39+
metric = "fsUsedBytes"
40+
sample = "K8sContainerSample"
41+
threshold = 10000 # 10 KB
42+
duration = 3600
43+
operator = "above"
44+
template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
45+
},
46+
]
47+
region = "US"
48+
cluster_name = "Agent_Control_Canaries_Production"
49+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
SELECT max(${metric})
2+
FROM ${sample}
3+
WHERE (
4+
clusterName = '${cluster_name}'
5+
AND deploymentName = 'ac-agent-control'
6+
%{ for k, v in wheres }
7+
AND ${k}='${v}'
8+
%{ endfor }
9+
)
Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,51 @@
1-
# Use the EKS cluster module
1+
# Use the EKS cluster module
22
module "eks_cluster" {
33
source = "../modules/eks_cluster"
44
canary_name = "Agent_Control_Canaries_Staging"
55
cluster_desired_size = 2
66
cluster_max_size = 3
77
cluster_min_size = 2
88
}
9+
10+
11+
variable "account_id" {}
12+
variable "api_key" {}
13+
module "alerts" {
14+
source = "../modules/nr_alerts"
15+
16+
api_key = var.api_key
17+
account_id = var.account_id
18+
policies_prefix = "Agent Control canaries metric monitoring"
19+
conditions = [
20+
{
21+
name = "CPU usage (cores)"
22+
metric = "cpuUsedCores"
23+
sample = "K8sContainerSample"
24+
threshold = 1
25+
duration = 3600
26+
operator = "above"
27+
template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
28+
},
29+
{
30+
name = "Memory usage (bytes)"
31+
metric = "memoryWorkingSetBytes"
32+
sample = "K8sContainerSample"
33+
threshold = 10000000 # 10 MB
34+
duration = 600
35+
operator = "above"
36+
template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
37+
},
38+
{
39+
name = "Storage usage (bytes)"
40+
metric = "fsUsedBytes"
41+
sample = "K8sContainerSample"
42+
threshold = 10000 # 10 KB
43+
duration = 3600
44+
operator = "above"
45+
template_name = "./alert_nrql_templates/generic_metric_threshold.tftpl"
46+
},
47+
]
48+
region = "Staging"
49+
cluster_name = "Agent_Control_Canaries_Staging"
50+
}
51+

0 commit comments

Comments
 (0)