Pipeline Control alert policies: remove the hard-coded
clusterName = 'YOUR_CLUSTER_NAME' filter from the existing NRQL conditions and
facet by clusterName instead; add ErrorRate and ServiceUnavailability conditions.

Notes on this patch:
* One diff row per line. YAML indentation inside rows is assumed to be the
  2-space block style (nrql.query, terms list) -- TODO confirm against the
  target files before applying, since context rows must match exactly.
* Both new files end with a trailing newline (no "\ No newline at end of file").
* ServiceUnavailability's description states what the condition evaluates
  (average health status BELOW 1 for 300 s). As written the condition has no
  value to compare when the metric stops reporting; detecting a missing signal
  would presumably need loss-of-signal settings -- verify against the alert schema.
* Blob ids on the "index" lines of the two new files were not recomputed.

diff --git a/alert-policies/pipeline-control/CPUUsage.yaml b/alert-policies/pipeline-control/CPUUsage.yaml
index 8bff972324..cf69a364cf 100644
--- a/alert-policies/pipeline-control/CPUUsage.yaml
+++ b/alert-policies/pipeline-control/CPUUsage.yaml
@@ -9,7 +9,7 @@ type: STATIC
 
 # NRQL query
 nrql:
-  query: "SELECT rate(sum(otelcol_process_cpu_seconds), 1 SECOND) FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
+  query: "SELECT rate(sum(otelcol_process_cpu_seconds), 1 SECOND) FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName, clusterName"
 
 # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
 valueFunction: SINGLE_VALUE
diff --git a/alert-policies/pipeline-control/DeploymentUnHealthy.yaml b/alert-policies/pipeline-control/DeploymentUnHealthy.yaml
index 7f2f44362e..a4e7ee14e9 100644
--- a/alert-policies/pipeline-control/DeploymentUnHealthy.yaml
+++ b/alert-policies/pipeline-control/DeploymentUnHealthy.yaml
@@ -9,7 +9,7 @@ type: STATIC
 
 # NRQL query
 nrql:
-  query: "SELECT latest(podsDesired) - latest(podsReady) FROM K8sReplicasetSample WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET replicasetName"
+  query: "SELECT latest(podsDesired) - latest(podsReady) FROM K8sReplicasetSample WHERE serviceName = 'pipeline-control-gateway' FACET replicasetName, clusterName"
 
 # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
 valueFunction: SINGLE_VALUE
diff --git a/alert-policies/pipeline-control/DropRuleRegistrationFailure.yaml b/alert-policies/pipeline-control/DropRuleRegistrationFailure.yaml
index fe91ab0c2d..0194088800 100644
--- a/alert-policies/pipeline-control/DropRuleRegistrationFailure.yaml
+++ b/alert-policies/pipeline-control/DropRuleRegistrationFailure.yaml
@@ -9,7 +9,7 @@ type: STATIC
 
 # NRQL query
 nrql:
-  query: "SELECT latest(otelcol_nrprocessor_rules_failed) FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
+  query: "SELECT latest(otelcol_nrprocessor_rules_failed) FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName, clusterName"
 
 # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
 valueFunction: SINGLE_VALUE
diff --git a/alert-policies/pipeline-control/ErrorRate.yaml b/alert-policies/pipeline-control/ErrorRate.yaml
new file mode 100644
index 0000000000..97a80ba694
--- /dev/null
+++ b/alert-policies/pipeline-control/ErrorRate.yaml
@@ -0,0 +1,28 @@
+name: High error rate
+
+description: |+
+  Alert triggered when the error rate(4xx/5xx errors) exceeds 5% during 5 minutes
+
+type: STATIC
+
+nrql:
+  query: "SELECT percentage(count(StatusCode),where numeric(StatusCode) >= 400 and numeric(StatusCode) <= 599) FROM Metric where metricName='otelcol_nrexporter_pcg_requests_latency' AND serviceName = 'pipeline-control-gateway' FACET podName, clusterName"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: ABOVE
+    # Value that triggers a violation
+    threshold: 5
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400
diff --git a/alert-policies/pipeline-control/Errors.yaml b/alert-policies/pipeline-control/Errors.yaml
index 57bb1d2c2b..b1979bd9ad 100644
--- a/alert-policies/pipeline-control/Errors.yaml
+++ b/alert-policies/pipeline-control/Errors.yaml
@@ -9,7 +9,7 @@ type: STATIC
 
 # NRQL query
 nrql:
-  query: "SELECT rate(sum(otelcol_nrreceiver_incoming_request_errors), 1 second) + rate(sum(otelcol_nrexporter_outgoing_requests_errors), 1 second) AS ERROR FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
+  query: "SELECT rate(sum(otelcol_nrreceiver_incoming_request_errors), 1 second) + rate(sum(otelcol_nrexporter_outgoing_requests_errors), 1 second) AS ERROR FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName,clusterName"
 
 # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
 valueFunction: SINGLE_VALUE
diff --git a/alert-policies/pipeline-control/MemoryUsage.yaml b/alert-policies/pipeline-control/MemoryUsage.yaml
index fd27d0303c..b5a7aa47c1 100644
--- a/alert-policies/pipeline-control/MemoryUsage.yaml
+++ b/alert-policies/pipeline-control/MemoryUsage.yaml
@@ -9,7 +9,7 @@ type: STATIC
 
 # NRQL query
 nrql:
-  query: "SELECT average(otelcol_process_memory_rss) FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
+  query: "SELECT average(otelcol_process_memory_rss) FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName,clusterName"
 
 # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
 valueFunction: SINGLE_VALUE
diff --git a/alert-policies/pipeline-control/RequestLatency.yaml b/alert-policies/pipeline-control/RequestLatency.yaml
index a4aaff1500..9174472976 100644
--- a/alert-policies/pipeline-control/RequestLatency.yaml
+++ b/alert-policies/pipeline-control/RequestLatency.yaml
@@ -9,7 +9,7 @@ type: STATIC
 
 # NRQL query
 nrql:
-  query: "SELECT percentile(otelcol_nrexporter_pcg_requests_latency, 99) / 1000 FROM Metric WHERE clusterName = 'YOUR_CLUSTER_NAME' AND serviceName = 'pipeline-control-gateway' FACET podName"
+  query: "SELECT percentile(otelcol_nrexporter_pcg_requests_latency, 99) / 1000 FROM Metric WHERE serviceName = 'pipeline-control-gateway' FACET podName,clusterName"
 
 # Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
 valueFunction: SINGLE_VALUE
diff --git a/alert-policies/pipeline-control/ServiceUnavailability.yaml b/alert-policies/pipeline-control/ServiceUnavailability.yaml
new file mode 100644
index 0000000000..0b21e8cf94
--- /dev/null
+++ b/alert-policies/pipeline-control/ServiceUnavailability.yaml
@@ -0,0 +1,31 @@
+name: Service Unavailability
+
+# Description and details
+description: |
+  The service health status metric has averaged below 1 for the last 5 minutes, indicating potential unavailability.
+
+# Type of alert: BASELINE | STATIC
+type: STATIC
+
+# NRQL query
+nrql:
+  query: "SELECT average(otelcol_nrreceiver_health_status) FROM Metric WHERE metricName = 'otelcol_nrreceiver_health_status' AND serviceName = 'pipeline-control-gateway' facet podName, clusterName"
+
+# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
+valueFunction: SINGLE_VALUE
+
+# List of Critical and Warning thresholds for the condition
+terms:
+  - priority: CRITICAL
+    # Operator used to compare against the threshold.
+    operator: BELOW
+    # Value that triggers a violation
+    threshold: 1
+    # Time in seconds; 120 - 3600
+    thresholdDuration: 300
+    # How many data points must be in violation for the duration
+    thresholdOccurrences: ALL
+
+# Duration after which a violation automatically closes
+# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
+violationTimeLimitSeconds: 86400