Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion alert-policies/pipeline-control/CPUUsage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type: STATIC

# NRQL query
nrql:
query: "SELECT rate(sum(otelcol_process_cpu_seconds), 1 SECOND) FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
query: "SELECT rate(sum(otelcol_process_cpu_seconds), 1 SECOND) FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName, clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/pipeline-control/DeploymentUnHealthy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type: STATIC

# NRQL query
nrql:
query: "SELECT latest(podsDesired) - latest(podsReady) FROM K8sReplicasetSample WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET replicasetName"
query: "SELECT latest(podsDesired) - latest(podsReady) FROM K8sReplicasetSample WHERE serviceName = 'pipeline-control-gateway' FACET replicasetName, clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type: STATIC

# NRQL query
nrql:
query: "SELECT latest(otelcol_nrprocessor_rules_failed) FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
query: "SELECT latest(otelcol_nrprocessor_rules_failed) FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName, clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
28 changes: 28 additions & 0 deletions alert-policies/pipeline-control/ErrorRate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Name of the alert
name: High error rate

# Description and details
description: |+
  Alert triggered when the error rate(4xx/5xx errors) exceeds 5% during 5 minutes

# Type of alert: BASELINE | STATIC
type: STATIC

# NRQL query
# Percentage of counted StatusCode values that fall in the 400-599 range for
# the pipeline-control-gateway service, evaluated separately for every
# podName / clusterName combination.
nrql:
  query: "SELECT percentage(count(StatusCode),where numeric(StatusCode) >= 400 and numeric(StatusCode) <= 599) FROM Metric where metricName='otelcol_nrexporter_pcg_requests_latency' AND serviceName = 'pipeline-control-gateway' FACET podName, clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (the query returns a percentage, so 5 means 5%)
    threshold: 5
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
2 changes: 1 addition & 1 deletion alert-policies/pipeline-control/Errors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type: STATIC

# NRQL query
nrql:
query: "SELECT rate(sum(otelcol_nrreceiver_incoming_request_errors), 1 second) + rate(sum(otelcol_nrexporter_outgoing_requests_errors), 1 second) AS ERROR FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
query: "SELECT rate(sum(otelcol_nrreceiver_incoming_request_errors), 1 second) + rate(sum(otelcol_nrexporter_outgoing_requests_errors), 1 second) AS ERROR FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName,clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/pipeline-control/MemoryUsage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type: STATIC

# NRQL query
nrql:
query: "SELECT average(otelcol_process_memory_rss) FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
query: "SELECT average(otelcol_process_memory_rss) FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName,clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
2 changes: 1 addition & 1 deletion alert-policies/pipeline-control/RequestLatency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type: STATIC

# NRQL query
nrql:
query: "SELECT percentile(otelcol_nrexporter_pcg_requests_latency, 99) / 1000 FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
query: "SELECT percentile(otelcol_nrexporter_pcg_requests_latency, 99) / 1000 FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName,clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE
Expand Down
31 changes: 31 additions & 0 deletions alert-policies/pipeline-control/ServiceUnavailability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Name of the alert
name: Service Unavailability

# Description and details
# The wording matches what the condition below evaluates: a static threshold on
# the average of the health status metric.
# NOTE(review): a static threshold is only evaluated when data is received, so
# a metric that stops reporting will not trip this condition on its own. If
# "metric not reported" must also alert, loss-of-signal (expiration) settings
# would be needed - confirm the intended behavior with the owning team.
description: |
  The average of the service health status metric has stayed below 1 for the last 5 minutes, indicating potential unavailability.

# Type of alert: BASELINE | STATIC
type: STATIC

# NRQL query
# Average health status of the pipeline-control-gateway service, evaluated
# separately for every podName / clusterName combination.
nrql:
  query: "SELECT average(otelcol_nrreceiver_health_status) FROM Metric WHERE metricName = 'otelcol_nrreceiver_health_status' AND serviceName = 'pipeline-control-gateway' facet podName, clusterName"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: BELOW
    # Value that triggers a violation
    # NOTE(review): presumably the metric reports 1 when healthy - confirm
    threshold: 1
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
Loading