Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions .github/workflows/alert-deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Deploys every `*-alerts.yaml` file found under `.nais/test` and `.nais/prod`
# to the matching cluster whenever one of them (or this workflow) changes on
# the main branch.
name: Deploy alerts
run-name: Deploy alerts for klass to test and prod

on:
  push:
    branches:
      - main
    paths:
      - '.nais/test/*-alerts.yaml'
      - '.nais/prod/*-alerts.yaml'
      - '.github/workflows/alert-deploy.yml'

permissions:
  # Listing any permission sets every unlisted one to "none", so the read
  # access that actions/checkout relies on has to be granted explicitly.
  contents: read
  # Allows the jobs to request an OIDC token
  # (presumably consumed by nais/deploy for authentication; confirm).
  id-token: write

jobs:
  # Builds one JSON array of alert file paths per environment and exposes
  # them as job outputs for the deploy matrices below.
  discover-alerts:
    name: Discover alert files
    runs-on: ubuntu-latest
    outputs:
      test-alerts: ${{ steps.discover.outputs.test }}
      prod-alerts: ${{ steps.discover.outputs.prod }}
    steps:
      - uses: actions/checkout@v4

      - id: discover
        run: |
          # `ls` errors are silenced so a directory without alert files
          # yields an empty list; jq turns the newline-separated paths into
          # a compact JSON array ("[]" when nothing matched).
          test_files=$(ls .nais/test/*-alerts.yaml 2>/dev/null | sort | jq -R -s -c 'split("\n")[:-1]')
          prod_files=$(ls .nais/prod/*-alerts.yaml 2>/dev/null | sort | jq -R -s -c 'split("\n")[:-1]')

          echo "Found test alerts: $test_files"
          echo "Found prod alerts: $prod_files"

          echo "test=$test_files" >> "$GITHUB_OUTPUT"
          echo "prod=$prod_files" >> "$GITHUB_OUTPUT"

  test-deploy:
    name: Deploy alerts to test
    needs: discover-alerts
    # An empty matrix vector fails the workflow, so skip the job when no
    # test alert files were discovered.
    if: needs.discover-alerts.outputs.test-alerts != '[]'
    runs-on: ubuntu-latest
    strategy:
      matrix:
        resource: ${{ fromJson(needs.discover-alerts.outputs.test-alerts) }}
    steps:
      - uses: actions/checkout@v4
      - name: Deploy ${{ matrix.resource }} to test
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: test
          RESOURCE: ${{ matrix.resource }}
          DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443

  prod-deploy:
    name: Deploy alerts to prod
    needs: discover-alerts
    # An empty matrix vector fails the workflow, so skip the job when no
    # prod alert files were discovered.
    if: needs.discover-alerts.outputs.prod-alerts != '[]'
    runs-on: ubuntu-latest
    strategy:
      matrix:
        resource: ${{ fromJson(needs.discover-alerts.outputs.prod-alerts) }}
    steps:
      - uses: actions/checkout@v4
      - name: Deploy ${{ matrix.resource }} to prod
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: prod
          RESOURCE: ${{ matrix.resource }}
          DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443
48 changes: 48 additions & 0 deletions .nais/prod/klass-api-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-api in the prod environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-api
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level events make up more than 10 % of all logback
        # events (3-minute rate) and that has held for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-api",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-api"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-api -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when outgoing HTTP client requests with a status other than
        # "200" have occurred at a non-zero 1-minute rate for 1 minute.
        # NOTE(review): status!="200" also matches other successful codes such
        # as 201 or 204; confirm that is intended.
        - alert: A Klass-api client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-api", status!="200"}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-api client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-api is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-api"} == 0
          for: 1m
          annotations:
            title: "Klass-api is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: prod
48 changes: 48 additions & 0 deletions .nais/prod/klass-forvaltning-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-forvaltning in the prod environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-forvaltning
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level events make up more than 10 % of all logback
        # events (3-minute rate) and that has held for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-forvaltning",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-forvaltning"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-forvaltning -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when outgoing HTTP client requests with a status other than
        # "200" have occurred at a non-zero 1-minute rate for 1 minute.
        # NOTE(review): status!="200" also matches other successful codes such
        # as 201 or 204; confirm that is intended.
        - alert: A Klass-forvaltning client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-forvaltning", status!="200"}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-forvaltning client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-forvaltning is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-forvaltning"} == 0
          for: 1m
          annotations:
            title: "Klass-forvaltning is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: prod
48 changes: 48 additions & 0 deletions .nais/prod/klass-mail-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-mail in the prod environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-mail
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level events make up more than 10 % of all logback
        # events (3-minute rate) and that has held for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-mail",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-mail"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-mail -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-mail
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when outgoing HTTP client requests with a status other than
        # "200" have occurred at a non-zero 1-minute rate for 1 minute.
        # NOTE(review): status!="200" also matches other successful codes such
        # as 201 or 204; confirm that is intended.
        - alert: A Klass-mail client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-mail", status!="200"}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-mail client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-mail
            namespace: dapla-metadata
            severity: critical
            environment: prod

        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-mail is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-mail"} == 0
          for: 1m
          annotations:
            title: "Klass-mail is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-mail
            namespace: dapla-metadata
            severity: critical
            environment: prod
22 changes: 22 additions & 0 deletions .nais/prod/klass-solr-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Prometheus alert rules for klass-solr in the prod environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-solr
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-solr is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-solr"} == 0
          for: 1m
          annotations:
            title: "Klass-solr is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-solr
            namespace: dapla-metadata
            severity: critical
            environment: prod
48 changes: 48 additions & 0 deletions .nais/test/klass-api-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-api in the test environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-api
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level events make up more than 10 % of all logback
        # events (3-minute rate) and that has held for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-api",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-api"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-api -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when outgoing HTTP client requests with a status other than
        # "200" have occurred at a non-zero 1-minute rate for 1 minute.
        # NOTE(review): status!="200" also matches other successful codes such
        # as 201 or 204; confirm that is intended.
        - alert: A Klass-api client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-api", status!="200"}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-api client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-api is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-api"} == 0
          for: 1m
          annotations:
            title: "Klass-api is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-api
            namespace: dapla-metadata
            severity: critical
            environment: test
48 changes: 48 additions & 0 deletions .nais/test/klass-forvaltning-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-forvaltning in the test environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-forvaltning
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level events make up more than 10 % of all logback
        # events (3-minute rate) and that has held for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-forvaltning",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-forvaltning"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-forvaltning -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when outgoing HTTP client requests with a status other than
        # "200" have occurred at a non-zero 1-minute rate for 1 minute.
        # NOTE(review): status!="200" also matches other successful codes such
        # as 201 or 204; confirm that is intended.
        - alert: A Klass-forvaltning client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-forvaltning", status!="200"}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-forvaltning client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-forvaltning is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-forvaltning"} == 0
          for: 1m
          annotations:
            title: "Klass-forvaltning is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-forvaltning
            namespace: dapla-metadata
            severity: critical
            environment: test
48 changes: 48 additions & 0 deletions .nais/test/klass-mail-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Prometheus alert rules for klass-mail in the test environment.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-klass-mail
  namespace: dapla-metadata
  labels:
    team: dapla-metadata
spec:
  groups:
    - name: dapla-metadata
      rules:
        # Fires when error-level events make up more than 10 % of all logback
        # events (3-minute rate) and that has held for 3 minutes.
        - alert: High number of errors
          expr: (100 * sum by (app, namespace) (rate(logback_events_total{app="klass-mail",level="error"}[3m])) / sum by (app, namespace) (rate(logback_events_total{app="klass-mail"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged"
            consequence: "There can be different causes for errors, check logs for cause and evaluation of consequences."
            action: "`kubectl describe pod -l app=klass-mail -n dapla-metadata` -> `kubectl logs <podname>`"
          labels:
            service: klass-mail
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when outgoing HTTP client requests with a status other than
        # "200" have occurred at a non-zero 1-minute rate for 1 minute.
        # NOTE(review): status!="200" also matches other successful codes such
        # as 201 or 204; confirm that is intended.
        - alert: A Klass-mail client is unavailable
          expr: rate(http_client_requests_seconds_count{app="klass-mail", status!="200"}[1m]) > 0
          for: 1m
          annotations:
            title: "A Klass-mail client is unavailable "
            consequence: "The service may lack some functionality"
            description: "Client {{ $labels.serviceId }} responded with {{ $labels.status }} causing {{ $labels.exception }}"
          labels:
            service: klass-mail
            namespace: dapla-metadata
            severity: critical
            environment: test

        # Fires when the deployment has had no available replicas for 1 minute.
        # The selector is scoped to the team namespace so a deployment with the
        # same name in another namespace cannot trigger or mask this alert.
        - alert: Klass-mail is unavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-metadata", deployment="klass-mail"} == 0
          for: 1m
          annotations:
            title: "Klass-mail is unavailable"
            consequence: "Service is unavailable to users. "
          labels:
            service: klass-mail
            namespace: dapla-metadata
            severity: critical
            environment: test
Loading
Loading