|
5 | 5 | import pytz
|
6 | 6 | from cryptography import x509
|
7 | 7 | from django.conf import settings
|
8 |
| -from django.http import JsonResponse |
| 8 | +from django.http import JsonResponse, HttpResponse |
9 | 9 | from django.shortcuts import get_object_or_404
|
10 | 10 | from django.utils import timezone as djangotime
|
11 | 11 | from django.views.decorators.csrf import csrf_exempt
|
@@ -409,43 +409,153 @@ def status(request):
|
409 | 409 | from agents.models import Agent
|
410 | 410 | from clients.models import Client, Site
|
411 | 411 |
|
412 |
| - disk_usage: int = round(psutil.disk_usage("/").percent) |
413 |
| - mem_usage: int = round(psutil.virtual_memory().percent) |
414 |
| - |
| 412 | + # common metrics bits |
415 | 413 | cert_file, _ = get_certs()
|
416 | 414 | cert_bytes = Path(cert_file).read_bytes()
|
417 |
| - |
418 | 415 | cert = x509.load_pem_x509_certificate(cert_bytes)
|
419 |
| - expires = pytz.utc.localize(cert.not_valid_after) |
420 |
| - now = djangotime.now() |
421 |
| - delta = expires - now |
422 |
| - |
423 |
| - ret = { |
424 |
| - "version": settings.TRMM_VERSION, |
425 |
| - "latest_agent_version": settings.LATEST_AGENT_VER, |
426 |
| - "agent_count": Agent.objects.count(), |
427 |
| - "client_count": Client.objects.count(), |
428 |
| - "site_count": Site.objects.count(), |
429 |
| - "disk_usage_percent": disk_usage, |
430 |
| - "mem_usage_percent": mem_usage, |
431 |
| - "days_until_cert_expires": delta.days, |
432 |
| - "cert_expired": delta.days < 0, |
| 416 | + |
| 417 | + # common services |
| 418 | + services = { |
| 419 | + "django": "rmm.service", |
| 420 | + "mesh": "meshcentral.service", |
| 421 | + "daphne": "daphne.service", |
| 422 | + "celery": "celery.service", |
| 423 | + "celerybeat": "celerybeat.service", |
| 424 | + "redis": "redis-server.service", |
| 425 | + "postgres": "postgresql.service", |
| 426 | + "mongo": "mongod.service", |
| 427 | + "nats": "nats.service", |
| 428 | + "nats-api": "nats-api.service", |
| 429 | + "nginx": "nginx.service", |
433 | 430 | }
|
434 | 431 |
|
435 |
| - if settings.DOCKER_BUILD: |
436 |
| - ret["services_running"] = "not available in docker" |
437 |
| - else: |
438 |
| - ret["services_running"] = { |
439 |
| - "django": sysd_svc_is_running("rmm.service"), |
440 |
| - "mesh": sysd_svc_is_running("meshcentral.service"), |
441 |
| - "daphne": sysd_svc_is_running("daphne.service"), |
442 |
| - "celery": sysd_svc_is_running("celery.service"), |
443 |
| - "celerybeat": sysd_svc_is_running("celerybeat.service"), |
444 |
| - "redis": sysd_svc_is_running("redis-server.service"), |
445 |
| - "postgres": sysd_svc_is_running("postgresql.service"), |
446 |
| - "mongo": sysd_svc_is_running("mongod.service"), |
447 |
| - "nats": sysd_svc_is_running("nats.service"), |
448 |
| - "nats-api": sysd_svc_is_running("nats-api.service"), |
449 |
| - "nginx": sysd_svc_is_running("nginx.service"), |
| 432 | + # TRMM json monitoring |
| 433 | + if request.method == "POST": |
| 434 | + disk_usage: int = round(psutil.disk_usage("/").percent) |
| 435 | + mem_usage: int = round(psutil.virtual_memory().percent) |
| 436 | + |
| 437 | + cert_expires = pytz.utc.localize(cert.not_valid_after) |
| 438 | + now = djangotime.now() |
| 439 | + delta = cert_expires - now |
| 440 | + |
| 441 | + ret = { |
| 442 | + "version": settings.TRMM_VERSION, |
| 443 | + "latest_agent_version": settings.LATEST_AGENT_VER, |
| 444 | + "agent_count": Agent.objects.count(), |
| 445 | + "client_count": Client.objects.count(), |
| 446 | + "site_count": Site.objects.count(), |
| 447 | + "disk_usage_percent": disk_usage, |
| 448 | + "mem_usage_percent": mem_usage, |
| 449 | + "days_until_cert_expires": delta.days, |
| 450 | + "cert_expired": delta.days < 0, |
450 | 451 | }
|
451 |
| - return JsonResponse(ret, json_dumps_params={"indent": 2}) |
| 452 | + |
| 453 | + if settings.DOCKER_BUILD: |
| 454 | + ret["services_running"] = "not available in docker" |
| 455 | + else: |
| 456 | + ret["services_running"] = {} |
| 457 | + for k, v in services.items(): |
| 458 | + ret["services_running"][k] = sysd_svc_is_running(v) |
| 459 | + return JsonResponse(ret, json_dumps_params={"indent": 2}) |
| 460 | + |
| 461 | + # TRMM Prometheus monitoring |
| 462 | + elif request.method == "GET": |
| 463 | + # get agent counts |
| 464 | + from clients.serializers import ClientSerializer |
| 465 | + from django.db.models import Count, Prefetch |
| 466 | + |
| 467 | + agent_counts = ClientSerializer( |
| 468 | + Client.objects.order_by("name").prefetch_related( |
| 469 | + Prefetch( |
| 470 | + "sites", |
| 471 | + queryset=Site.objects.order_by("name") |
| 472 | + .select_related("client") |
| 473 | + .annotate(agent_count=Count("agents")), |
| 474 | + to_attr="filtered_sites", |
| 475 | + ) |
| 476 | + ), |
| 477 | + many=True, |
| 478 | + ).data |
| 479 | + |
| 480 | + # generate agent count metrics |
| 481 | + agent_count_metrics = [] |
| 482 | + for client in agent_counts: |
| 483 | + for site in client["sites"]: |
| 484 | + agent_count_metrics.append( |
| 485 | + ( |
| 486 | + {"client": client["name"], "site": site["name"]}, |
| 487 | + site["agent_count"], |
| 488 | + ) |
| 489 | + ) |
| 490 | + |
| 491 | + # create base prometheus metric dataset |
| 492 | + metrics = { |
| 493 | + "trmm_buildinfo": { |
| 494 | + "type": "gauge", |
| 495 | + "help": "trmm version", |
| 496 | + "entries": [({"version": settings.TRMM_VERSION}, 1)], |
| 497 | + }, |
| 498 | + "trmm_meshinfo": { |
| 499 | + "type": "gauge", |
| 500 | + "help": "meshcentral version", |
| 501 | + "entries": [({"version": settings.MESH_VER}, 1)], |
| 502 | + }, |
| 503 | + "trmm_natsinfo": { |
| 504 | + "type": "gauge", |
| 505 | + "help": "nats version", |
| 506 | + "entries": [({"version": settings.NATS_SERVER_VER}, 1)], |
| 507 | + }, |
| 508 | + "trmm_appinfo": { |
| 509 | + "type": "gauge", |
| 510 | + "help": "vue version", |
| 511 | + "entries": [({"version": settings.APP_VER}, 1)], |
| 512 | + }, |
| 513 | + "trmm_agentinfo": { |
| 514 | + "type": "gauge", |
| 515 | + "help": "latest version of trmm agent", |
| 516 | + "entries": [({"version": settings.LATEST_AGENT_VER}, 1)], |
| 517 | + }, |
| 518 | + "trmm_agents": { |
| 519 | + "type": "gauge", |
| 520 | + "help": "number of registered agents in trmm", |
| 521 | + "entries": agent_count_metrics, |
| 522 | + }, |
| 523 | + "trmm_cert_expiry": { |
| 524 | + "type": "gauge", |
| 525 | + "help": "unix timestamp of certificate expiration", |
| 526 | + "entries": [({}, cert.not_valid_after.timestamp())], |
| 527 | + }, |
| 528 | + } |
| 529 | + |
| 530 | + # add service metrics if this is not a docker build |
| 531 | + if not settings.DOCKER_BUILD: |
| 532 | + e = [] |
| 533 | + for k, v in services.items(): |
| 534 | + e.append(({"name": v, "service": k}, int(sysd_svc_is_running(v)))) |
| 535 | + |
| 536 | + metrics["trmm_systemd_unit_state"] = { |
| 537 | + "type": "gauge", |
| 538 | + "help": "trmm service status for non docker builds", |
| 539 | + "entries": e, |
| 540 | + } |
| 541 | + |
| 542 | + # render prometheus metrics |
| 543 | + payload = "" |
| 544 | + for metric, data in metrics.items(): |
| 545 | + # create help and type hints |
| 546 | + if "help" in data: |
| 547 | + payload += "# HELP {} {}\n".format(metric, data["help"]) |
| 548 | + payload += "# TYPE {} {}\n".format(metric, data["type"]) |
| 549 | + # populate the metrics |
| 550 | + for labels, value in data["entries"]: |
| 551 | + label_string = ",".join( |
| 552 | + ['{}="{}"'.format(i[0], i[1]) for i in labels.items()] |
| 553 | + ) |
| 554 | + if label_string != "": |
| 555 | + label_string = "{{{}}}".format(label_string) |
| 556 | + payload += "{}{} {}\n".format(metric, label_string, value) |
| 557 | + return HttpResponse(payload, content_type="text/plain") |
| 558 | + |
| 559 | + # The monitoring_view decorator should prevent this state from ever occurring. |
| 560 | + else: |
| 561 | + return HttpResponse("It should not be possible to be here.\n", status=500) |
0 commit comments