Skip to content

Commit c63839c

Browse files
committed
feat: add initial prometheus metrics support
--- Metrics for the Prometheus endpoint are slightly different from the JSON one. - I do not want to break things for those that use the old endpoint. - Some metrics are hard to utilize in Prometheus without a format change. (example: certificate timestamp) - Usually blackbox_exporter would cover certificate monitoring, but since people use TRMM with multiple proxies, both internal monitoring and blackbox_exporter may be useful. - Memory and disk space were not transferred because they are system-wide and Prometheus users will likely have node_exporter installed for that. - The Prometheus format has slightly more detailed metrics. --- Instructions: 1. Set up the MON_TOKEN variable in `/rmm/api/tacticalrmm/tacticalrmm/local_settings.py` for your bearer_token. This is the same as for the JSON endpoint. See [Tips and Tricks](https://docs.tacticalrmm.com/tipsntricks/#monitor-your-trmm-instance-via-the-built-in-monitoring-endpoint). 2. Test with the curl command: `curl -s -H "Authorization: Bearer $MON_TOKEN" https://api.trmm.example.com/core/status/` 3. Set up a Prometheus job with: ``` - job_name: trmm scrape_interval: 60s metrics_path: /core/status/ scheme: https bearer_token: $MON_TOKEN static_configs: - targets: - api.trmm.example.com ```
1 parent b28316a commit c63839c

File tree

3 files changed

+309
-48
lines changed

3 files changed

+309
-48
lines changed

api/tacticalrmm/core/decorators.py

+25-13
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,35 @@
66

77
def monitoring_view(function):
88
def wrap(request, *args, **kwargs):
9-
if request.method != "POST":
10-
return HttpResponse("Invalid request type\n", status=400)
9+
if request.method == "POST":
10+
try:
11+
data = json.loads(request.body)
12+
except:
13+
return HttpResponse("Invalid json\n", status=400)
14+
15+
if "auth" not in data.keys():
16+
return HttpResponse("Invalid payload\n", status=400)
17+
18+
token = getattr(settings, "MON_TOKEN", "")
19+
if not token:
20+
return HttpResponse("Missing token\n", status=401)
1121

12-
try:
13-
data = json.loads(request.body)
14-
except:
15-
return HttpResponse("Invalid json\n", status=400)
22+
if data.get("auth") != token:
23+
return HttpResponse("Not authenticated\n", status=401)
1624

17-
if "auth" not in data.keys():
18-
return HttpResponse("Invalid payload\n", status=400)
25+
elif request.method == "GET":
26+
if "Authorization" not in request.headers:
27+
return HttpResponse("Missing 'Authorization' header\n", status=400)
1928

20-
token = getattr(settings, "MON_TOKEN", "")
21-
if not token:
22-
return HttpResponse("Missing token\n", status=401)
29+
token = getattr(settings, "MON_TOKEN", "")
30+
if not token:
31+
return HttpResponse("Missing token\n", status=401)
2332

24-
if data.get("auth") != token:
25-
return HttpResponse("Not authenticated\n", status=401)
33+
if request.headers["Authorization"] != "Bearer " + token:
34+
return HttpResponse("Not authenticated\n", status=401)
35+
36+
else:
37+
return HttpResponse("Invalid request type\n", status=400)
2638

2739
return function(request, *args, **kwargs)
2840

api/tacticalrmm/core/tests.py

+139
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from unittest.mock import patch
22

3+
import tempfile
34
import requests
45
from channels.db import database_sync_to_async
56
from channels.testing import WebsocketCommunicator
@@ -500,3 +501,141 @@ def test_get_meshagent_url_docker(self):
500501
r,
501502
"http://tactical-meshcentral:4443/meshagents?id=4&meshid=abc123&installflags=0",
502503
)
504+
505+
506+
class TestMonitoring(TacticalTestCase):
507+
url = "/core/status/"
508+
509+
def setUp(self):
510+
self.setup_client()
511+
self.setup_coresettings()
512+
513+
# sample data for generated metrics
514+
client1 = baker.make("clients.Client")
515+
client1_site1 = baker.make("clients.Site", client=client1)
516+
client1_site2 = baker.make("clients.Site", client=client1)
517+
agent1 = baker.make("agents.agent", _quantity=10, site=client1_site1)
518+
agent2 = baker.make("agents.agent", _quantity=13, site=client1_site2)
519+
client2 = baker.make("clients.Client")
520+
client2_site1 = baker.make("clients.Site", client=client2)
521+
agent3 = baker.make("agents.agent", _quantity=13, site=client2_site1)
522+
523+
# Generate snakeoil cert with `make-ssl-cert generate-default-snakeoil` on ubuntu
524+
# Cert will be in '/etc/ssl/certs/ssl-cert-snakeoil.pem'.
525+
# The cert is used only for its expiration date, so it can be self-signed or expired, and no key is needed.
526+
self.snakeoil_certificate = tempfile.NamedTemporaryFile(delete=False)
527+
self.snakeoil_certificate.write(
528+
"""-----BEGIN CERTIFICATE-----
529+
MIIC4jCCAcqgAwIBAgIUCFgTym78sGgRHwEmLyGgmr1JjSUwDQYJKoZIhvcNAQEL
530+
BQAwFzEVMBMGA1UEAwwMZjNjMTQzOWM0NzZjMB4XDTIzMDMzMTA1MTgzOFoXDTMz
531+
MDMyODA1MTgzOFowFzEVMBMGA1UEAwwMZjNjMTQzOWM0NzZjMIIBIjANBgkqhkiG
532+
9w0BAQEFAAOCAQ8AMIIBCgKCAQEAzFWItB4aM/aUWIhk0SS1XKHLHao9/OwbGHet
533+
lnrlZD2YM/DdUzqdYeYdujyLvWUj1xU+YcFv+vo3Mmu8HQVOKNcEZ5ZilHW/87X8
534+
6ZjtUzPYmCapxXNTX8yh2EES582uq64j0t3OwfaCJmpJLwjvCnrizfUFe76iy5Ge
535+
wVviYtkaIfHEwNoJLmFb07rYhNuV4tiwHUhmZqqm5nxpjKbTsI4YHnpSxNktU32C
536+
vNVnIRIAHDZ8n8wCaKTPZMui9X/IJx1pA3EkbD2givbH/0nYRcd5ZUDxLsTJThob
537+
8k5kPd1zVXqaH/ufqkekqoiY+kIWsgVd0iWx3qihhydAhRY5SQIDAQABoyYwJDAJ
538+
BgNVHRMEAjAAMBcGA1UdEQQQMA6CDGYzYzE0MzljNDc2YzANBgkqhkiG9w0BAQsF
539+
AAOCAQEAH91bAuK3tKf1v4D+t48SWSE2uFjCe6o2CzMwAdM3rVa47X2cw5nKOH5L
540+
8nQJhJjq/t93DJi4WOpN579NWtTkwXyCl7srSvj8aK4FDKxKcWQNT1PUAa+gh8IB
541+
WJdEK4lMSatCtA/wsq6jmkTwINZ/ELZp4BRU2gUp8mFU9fVQDMlY+2qwUzzIp97A
542+
WISWVxML58FDFnQLsaP1SfapVWTTXTh4xnhr7VxklUadcGRnx9+Ig4Ieq27eSCiV
543+
DC/aSRIyi9HaVZPTMbqLC50auHr/dQIL4pGyxFTD8OJoeRkQgAb1wWuAPhab20Xu
544+
XyFzZMiRlyNNSPoYVExb65s1bawqew==
545+
-----END CERTIFICATE-----""".encode(
546+
encoding="utf-8"
547+
)
548+
)
549+
self.snakeoil_certificate.close()
550+
551+
def tearDown(self):
552+
from os import unlink
553+
554+
unlink(self.snakeoil_certificate.name)
555+
556+
# prometheus tests
557+
def test_prometheus_missing_auth_header_request(self):
558+
r = self.client.get(self.url)
559+
self.assertEqual(r.status_code, 400)
560+
561+
def test_prometheus_missing_token_config(self):
562+
r = self.client.get(self.url, HTTP_Authorization="Bearer MySuperTestSecret")
563+
self.assertEqual(r.status_code, 401)
564+
565+
@override_settings(MON_TOKEN="MySuperTestSecret")
566+
def test_prometheus_incorrect_token_request(self):
567+
r = self.client.get(self.url, HTTP_Authorization="Bearer NotMySuperTestSecret")
568+
self.assertEqual(r.status_code, 401)
569+
570+
@override_settings(DOCKER_BUILD=True, MON_TOKEN="MySuperTestSecret")
571+
def test_prometheus_correct_docker_build_request(self):
572+
with self.settings(
573+
CERT_FILE=self.snakeoil_certificate.name, KEY_FILE="/do/not/need/a/key/here"
574+
):
575+
r = self.client.get(self.url, HTTP_Authorization="Bearer MySuperTestSecret")
576+
self.assertEqual(r.status_code, 200)
577+
578+
@override_settings(MON_TOKEN="MySuperTestSecret")
579+
def test_prometheus_correct_request(self):
580+
with self.settings(
581+
CERT_FILE=self.snakeoil_certificate.name, KEY_FILE="/do/not/need/a/key/here"
582+
):
583+
r = self.client.get(self.url, HTTP_Authorization="Bearer MySuperTestSecret")
584+
self.assertEqual(r.status_code, 200)
585+
586+
# invalid tests
587+
def test_invalid_request(self):
588+
r = self.client.put(self.url)
589+
self.assertEqual(r.status_code, 400)
590+
self.assertEqual(
591+
r.content,
592+
b"Invalid request type\n",
593+
)
594+
595+
# json tests
596+
def test_json_invalid_json_request(self):
597+
r = self.client.post(
598+
self.url,
599+
data="I am not json!",
600+
content_type="application/json",
601+
)
602+
self.assertEqual(r.status_code, 400)
603+
604+
def test_json_invalid_payload_request(self):
605+
r = self.client.post(
606+
self.url, data={"notauth": "NotMySuperTestSecret"}, format="json"
607+
)
608+
self.assertEqual(r.status_code, 400)
609+
610+
def test_json_missing_token_request(self):
611+
r = self.client.post(
612+
self.url, data={"auth": "MySuperTestSecret"}, format="json"
613+
)
614+
self.assertEqual(r.status_code, 401)
615+
616+
@override_settings(MON_TOKEN="MySuperTestSecret")
617+
def test_json_incorrect_token_request(self):
618+
r = self.client.post(
619+
self.url, data={"auth": "NotMySuperTestSecret"}, format="json"
620+
)
621+
self.assertEqual(r.status_code, 401)
622+
623+
@override_settings(MON_TOKEN="MySuperTestSecret")
624+
def test_json_correct_request(self):
625+
with self.settings(
626+
CERT_FILE=self.snakeoil_certificate.name, KEY_FILE="/do/not/need/a/key/here"
627+
):
628+
r = self.client.post(
629+
self.url, data={"auth": "MySuperTestSecret"}, format="json"
630+
)
631+
self.assertEqual(r.status_code, 200)
632+
633+
@override_settings(DOCKER_BUILD=True, MON_TOKEN="MySuperTestSecret")
634+
def test_json_correct_docker_build_request(self):
635+
with self.settings(
636+
CERT_FILE=self.snakeoil_certificate.name, KEY_FILE="/do/not/need/a/key/here"
637+
):
638+
r = self.client.post(
639+
self.url, data={"auth": "MySuperTestSecret"}, format="json"
640+
)
641+
self.assertEqual(r.status_code, 200)

api/tacticalrmm/core/views.py

+145-35
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pytz
66
from cryptography import x509
77
from django.conf import settings
8-
from django.http import JsonResponse
8+
from django.http import JsonResponse, HttpResponse
99
from django.shortcuts import get_object_or_404
1010
from django.utils import timezone as djangotime
1111
from django.views.decorators.csrf import csrf_exempt
@@ -409,43 +409,153 @@ def status(request):
409409
from agents.models import Agent
410410
from clients.models import Client, Site
411411

412-
disk_usage: int = round(psutil.disk_usage("/").percent)
413-
mem_usage: int = round(psutil.virtual_memory().percent)
414-
412+
# common metrics bits
415413
cert_file, _ = get_certs()
416414
cert_bytes = Path(cert_file).read_bytes()
417-
418415
cert = x509.load_pem_x509_certificate(cert_bytes)
419-
expires = pytz.utc.localize(cert.not_valid_after)
420-
now = djangotime.now()
421-
delta = expires - now
422-
423-
ret = {
424-
"version": settings.TRMM_VERSION,
425-
"latest_agent_version": settings.LATEST_AGENT_VER,
426-
"agent_count": Agent.objects.count(),
427-
"client_count": Client.objects.count(),
428-
"site_count": Site.objects.count(),
429-
"disk_usage_percent": disk_usage,
430-
"mem_usage_percent": mem_usage,
431-
"days_until_cert_expires": delta.days,
432-
"cert_expired": delta.days < 0,
416+
417+
# common services
418+
services = {
419+
"django": "rmm.service",
420+
"mesh": "meshcentral.service",
421+
"daphne": "daphne.service",
422+
"celery": "celery.service",
423+
"celerybeat": "celerybeat.service",
424+
"redis": "redis-server.service",
425+
"postgres": "postgresql.service",
426+
"mongo": "mongod.service",
427+
"nats": "nats.service",
428+
"nats-api": "nats-api.service",
429+
"nginx": "nginx.service",
433430
}
434431

435-
if settings.DOCKER_BUILD:
436-
ret["services_running"] = "not available in docker"
437-
else:
438-
ret["services_running"] = {
439-
"django": sysd_svc_is_running("rmm.service"),
440-
"mesh": sysd_svc_is_running("meshcentral.service"),
441-
"daphne": sysd_svc_is_running("daphne.service"),
442-
"celery": sysd_svc_is_running("celery.service"),
443-
"celerybeat": sysd_svc_is_running("celerybeat.service"),
444-
"redis": sysd_svc_is_running("redis-server.service"),
445-
"postgres": sysd_svc_is_running("postgresql.service"),
446-
"mongo": sysd_svc_is_running("mongod.service"),
447-
"nats": sysd_svc_is_running("nats.service"),
448-
"nats-api": sysd_svc_is_running("nats-api.service"),
449-
"nginx": sysd_svc_is_running("nginx.service"),
432+
# TRMM json monitoring
433+
if request.method == "POST":
434+
disk_usage: int = round(psutil.disk_usage("/").percent)
435+
mem_usage: int = round(psutil.virtual_memory().percent)
436+
437+
cert_expires = pytz.utc.localize(cert.not_valid_after)
438+
now = djangotime.now()
439+
delta = cert_expires - now
440+
441+
ret = {
442+
"version": settings.TRMM_VERSION,
443+
"latest_agent_version": settings.LATEST_AGENT_VER,
444+
"agent_count": Agent.objects.count(),
445+
"client_count": Client.objects.count(),
446+
"site_count": Site.objects.count(),
447+
"disk_usage_percent": disk_usage,
448+
"mem_usage_percent": mem_usage,
449+
"days_until_cert_expires": delta.days,
450+
"cert_expired": delta.days < 0,
450451
}
451-
return JsonResponse(ret, json_dumps_params={"indent": 2})
452+
453+
if settings.DOCKER_BUILD:
454+
ret["services_running"] = "not available in docker"
455+
else:
456+
ret["services_running"] = {}
457+
for k, v in services.items():
458+
ret["services_running"][k] = sysd_svc_is_running(v)
459+
return JsonResponse(ret, json_dumps_params={"indent": 2})
460+
461+
# TRMM Prometheus monitoring
462+
elif request.method == "GET":
463+
# get agent counts
464+
from clients.serializers import ClientSerializer
465+
from django.db.models import Count, Prefetch
466+
467+
agent_counts = ClientSerializer(
468+
Client.objects.order_by("name").prefetch_related(
469+
Prefetch(
470+
"sites",
471+
queryset=Site.objects.order_by("name")
472+
.select_related("client")
473+
.annotate(agent_count=Count("agents")),
474+
to_attr="filtered_sites",
475+
)
476+
),
477+
many=True,
478+
).data
479+
480+
# generate agent count metrics
481+
agent_count_metrics = []
482+
for client in agent_counts:
483+
for site in client["sites"]:
484+
agent_count_metrics.append(
485+
(
486+
{"client": client["name"], "site": site["name"]},
487+
site["agent_count"],
488+
)
489+
)
490+
491+
# create base prometheus metric dataset
492+
metrics = {
493+
"trmm_buildinfo": {
494+
"type": "gauge",
495+
"help": "trmm version",
496+
"entries": [({"version": settings.TRMM_VERSION}, 1)],
497+
},
498+
"trmm_meshinfo": {
499+
"type": "gauge",
500+
"help": "meshcentral version",
501+
"entries": [({"version": settings.MESH_VER}, 1)],
502+
},
503+
"trmm_natsinfo": {
504+
"type": "gauge",
505+
"help": "nats version",
506+
"entries": [({"version": settings.NATS_SERVER_VER}, 1)],
507+
},
508+
"trmm_appinfo": {
509+
"type": "gauge",
510+
"help": "vue version",
511+
"entries": [({"version": settings.APP_VER}, 1)],
512+
},
513+
"trmm_agentinfo": {
514+
"type": "gauge",
515+
"help": "latest version of trmm agent",
516+
"entries": [({"version": settings.LATEST_AGENT_VER}, 1)],
517+
},
518+
"trmm_agents": {
519+
"type": "gauge",
520+
"help": "number of registered agents in trmm",
521+
"entries": agent_count_metrics,
522+
},
523+
"trmm_cert_expiry": {
524+
"type": "gauge",
525+
"help": "unix timestamp of certificate expiration",
526+
"entries": [({}, cert.not_valid_after.timestamp())],
527+
},
528+
}
529+
530+
# add service metrics if this is not a docker build
531+
if not settings.DOCKER_BUILD:
532+
e = []
533+
for k, v in services.items():
534+
e.append(({"name": v, "service": k}, int(sysd_svc_is_running(v))))
535+
536+
metrics["trmm_systemd_unit_state"] = {
537+
"type": "gauge",
538+
"help": "trmm service status for non docker builds",
539+
"entries": e,
540+
}
541+
542+
# render prometheus metrics
543+
payload = ""
544+
for metric, data in metrics.items():
545+
# create help and type hints
546+
if "help" in data:
547+
payload += "# HELP {} {}\n".format(metric, data["help"])
548+
payload += "# TYPE {} {}\n".format(metric, data["type"])
549+
# populate the metrics
550+
for labels, value in data["entries"]:
551+
label_string = ",".join(
552+
['{}="{}"'.format(i[0], i[1]) for i in labels.items()]
553+
)
554+
if label_string != "":
555+
label_string = "{{{}}}".format(label_string)
556+
payload += "{}{} {}\n".format(metric, label_string, value)
557+
return HttpResponse(payload, content_type="text/plain")
558+
559+
# The monitoring_view decorator should prevent this state from ever occurring.
560+
else:
561+
return HttpResponse("It should not be possible to be here.\n", status=500)

0 commit comments

Comments
 (0)