From 070e7996199047edef6aac4ae624bea8c1600084 Mon Sep 17 00:00:00 2001 From: Gabriel Salla Date: Fri, 13 Feb 2026 09:10:09 -0300 Subject: [PATCH 1/4] add http server port to docs --- README.md | 2 +- docs/http_server.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a2739f35..41ad2e24 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ Common use cases: State machine-related issues often require several data checks and conditional logic to identify. These issues are typically difficult to capture using standard logs and metrics but can be easily addressed using Sentinela Monitoring. # Dashboard -Sentinela provides a web dashboard with 2 sections: +Sentinela provides a web dashboard, by default at port `8000`, with 2 sections: 1. an overview of the monitors and their alerts and issues 2. a monitor editor, where you can create and edit monitors directly from the browser diff --git a/docs/http_server.md b/docs/http_server.md index 35b2e80e..11bb02f5 100644 --- a/docs/http_server.md +++ b/docs/http_server.md @@ -1,6 +1,9 @@ # HTTP server The HTTP server provides an API to interact with Sentinela. The available routes are organized into two main categories, based on the deployment setup. +> [!IMPORTANT] +> By default the API is served at port `8000`. The docker compose files also expose the port `8000`, so if the port for the server changes, the compose files should be updated accordingly. Another option is to keep the server port at `8000` and change only the compose files. Using the configuration `8080:8000`, for example, will keep the server running at port `8000` inside the container, but it will be accessible through the host's port `8080`. + If the container is deployed with the **Controller** (either standalone or alongside the Executor in the same container), all routes are available, allowing interactions with Monitors, Issues, Alerts and the dashboard.
If the container is deployed with only the **Executor**, only base routes are available. From 9fe8a5290ff2dfb1953ad1a54bf2968c904d9d5a Mon Sep 17 00:00:00 2001 From: Gabriel Salla Date: Wed, 11 Feb 2026 19:25:20 -0300 Subject: [PATCH 2/4] add heartbeat settings to configs --- configs/configs-scalable.yaml | 2 ++ configs/configs.yaml | 2 ++ docs/configuration_file.md | 15 ++++++++++----- resources/kubernetes_template/config_map.yaml | 2 ++ src/configs/configs_loader.py | 2 ++ 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/configs/configs-scalable.yaml b/configs/configs-scalable.yaml index a8f5fe30..116f98e1 100644 --- a/configs/configs-scalable.yaml +++ b/configs/configs-scalable.yaml @@ -38,6 +38,8 @@ http_server: time_zone: America/Sao_Paulo +heartbeat_time: 2 + controller_process_schedule: "* * * * *" controller_concurrency: 5 controller_procedures: diff --git a/configs/configs.yaml b/configs/configs.yaml index e612a65f..fc102cfb 100644 --- a/configs/configs.yaml +++ b/configs/configs.yaml @@ -33,6 +33,8 @@ http_server: time_zone: America/Sao_Paulo +heartbeat_time: 2 + controller_process_schedule: "* * * * *" controller_concurrency: 5 controller_procedures: diff --git a/docs/configuration_file.md b/docs/configuration_file.md index bcde4b2d..f67ae079 100644 --- a/docs/configuration_file.md +++ b/docs/configuration_file.md @@ -65,14 +65,19 @@ application_queue: ## Time Zone - `time_zone`: String. Time zone to use for cron scheduling and notification messages. +## Heartbeat +- `heartbeat_time`: Integer. Time, in seconds, between each heartbeat. This heartbeat is used to identify when a task is not yielding control back to the event loop for too long, in which case a warning is logged. + ## Controller Settings - `controller_process_schedule`: String using Cron format. Schedule to check if monitors need to be processed. - `controller_concurrency`: Integer. Number of monitors that can be processed at the same time by the Controller.
- `controller_procedures`: Map. Procedures to be executed by the Controller and their settings. -- `controller_procedures.monitors_stuck`: Map. Settings for the procedure to fix monitors stuck in "queued" or "running" status. -- `controller_procedures.monitors_stuck.schedule`: String using Cron format. Schedule to execute the `monitors_stuck` procedure. -- `controller_procedures.monitors_stuck.params.time_tolerance`: Integer. Time tolerance in seconds for a monitor to be considered as stuck. This parameter is directly impacted by the `executor_monitor_heartbeat_time` setting and the recommended value is 2 times the heartbeat time. -- `controller_procedures.notifications_alert_solved.schedule`: String using Cron format. Schedule to execute the `notifications_alert_solved` procedure. + - `monitors_stuck`: Map. Settings for the procedure to fix monitors stuck in "queued" or "running" status. + - `schedule`: String using Cron format. Schedule to execute the `monitors_stuck` procedure. + - `params`: Map. Configuration parameters for the `monitors_stuck` procedure. + - `time_tolerance`: Integer. Time tolerance in seconds for a monitor to be considered as stuck. This parameter is directly impacted by the `executor_monitor_heartbeat_time` setting and the recommended value is 2 times the heartbeat time. + - `notifications_alert_solved`: Map. Settings for the procedure to identify and fix active notifications linked to alerts that have already been solved. + - `schedule`: String using Cron format. Schedule to execute the `notifications_alert_solved` procedure. ## Executor Settings - `executor_concurrency`: Integer. Number of tasks that can be executed at the same time by each Executor. @@ -80,7 +85,7 @@ application_queue: - `executor_monitor_timeout`: Integer. Timeout, in seconds, for monitor execution. - `executor_reaction_timeout`: Integer. Timeout, in seconds, for reactions execution. - `executor_request_timeout`: Integer. Timeout, in seconds, for requests execution. 
-- `executor_monitor_heartbeat_time`: Integer. Time, in seconds, between each monitor heartbeat. This parameter impacts the controller procedure `monitors_stuck.time_tolerance` parameter. +- `executor_monitor_heartbeat_time`: Integer. Time, in seconds, between each executor heartbeat during monitor execution. This parameter impacts the controller procedure `monitors_stuck.time_tolerance` parameter. ## Issues Creation - `max_issues_creation`: Integer. Maximum number of issues that can be created by each monitor in a single search. Can be overridden by the monitors' configuration. diff --git a/resources/kubernetes_template/config_map.yaml b/resources/kubernetes_template/config_map.yaml index 394900b3..7aaa9ea8 100644 --- a/resources/kubernetes_template/config_map.yaml +++ b/resources/kubernetes_template/config_map.yaml @@ -44,6 +44,8 @@ data: time_zone: America/Sao_Paulo + heartbeat_time: 2 + controller_process_schedule: "* * * * *" controller_concurrency: 5 controller_procedures: diff --git a/src/configs/configs_loader.py b/src/configs/configs_loader.py index de831604..9e4fd8dc 100644 --- a/src/configs/configs_loader.py +++ b/src/configs/configs_loader.py @@ -62,6 +62,8 @@ class Configs: time_zone: str + heartbeat_time: int + controller_process_schedule: str controller_concurrency: int controller_procedures: dict[str, ControllerProcedureConfig] From 98d0c3fb2a054832f0fcc6b6d5b98596e01d3dbb Mon Sep 17 00:00:00 2001 From: Gabriel Salla Date: Wed, 11 Feb 2026 19:25:54 -0300 Subject: [PATCH 3/4] create heartbeat component --- docs/monitoring_sentinela.md | 1 + src/components/heartbeat/__init__.py | 5 ++ src/components/heartbeat/heartbeat.py | 49 +++++++++++++++ tests/components/heartbeat/test_heartbeat.py | 63 ++++++++++++++++++++ 4 files changed, 118 insertions(+) create mode 100644 src/components/heartbeat/__init__.py create mode 100644 src/components/heartbeat/heartbeat.py create mode 100644 tests/components/heartbeat/test_heartbeat.py diff --git 
a/docs/monitoring_sentinela.md b/docs/monitoring_sentinela.md index f7605671..424daefa 100644 --- a/docs/monitoring_sentinela.md +++ b/docs/monitoring_sentinela.md @@ -48,5 +48,6 @@ The Prometheus metrics provided by Sentinela are: - Labels: `action_name` - `executor_request_execution_seconds`: Summary - Time to run the request - Labels: `action_name` +- `heartbeat_average_time`: Gauge - Average time between heartbeats in seconds - `registry_monitors_ready_timeout_count`: Counter - Count of times the application timed out waiting for monitors to be ready - `registry_monitor_not_registered_count`: Counter - Count of times a monitor is not registered after a load attempt diff --git a/src/components/heartbeat/__init__.py b/src/components/heartbeat/__init__.py new file mode 100644 index 00000000..b9684892 --- /dev/null +++ b/src/components/heartbeat/__init__.py @@ -0,0 +1,5 @@ +from .heartbeat import run + +__all__ = [ + "run", +] diff --git a/src/components/heartbeat/heartbeat.py b/src/components/heartbeat/heartbeat.py new file mode 100644 index 00000000..9f407eda --- /dev/null +++ b/src/components/heartbeat/heartbeat.py @@ -0,0 +1,49 @@ +import logging +import time +from collections import deque +from itertools import pairwise + +import prometheus_client + +import utils.app as app +from configs import configs + +_logger = logging.getLogger("heartbeat") + +prometheus_heartbeat_average_time = prometheus_client.Gauge( + "heartbeat_average_time", "Average time between heartbeats in seconds" +) + + +def _is_heartbeat_delayed(timestamps: deque[float], threshold: float) -> bool: + """Determine if the heartbeat is delayed based on the average latency between timestamps""" + if len(timestamps) < 2: + return False + + latencies = [b - a for a, b in pairwise(timestamps)] + average_latency = sum(latencies) / len(latencies) + prometheus_heartbeat_average_time.set(average_latency) + return average_latency > threshold + + +async def run() -> None: + """Create a heartbeat for the 
application to detect when some tasks are not yielding control + back to the event loop. If the heartbeat is delayed, a warning message is logged.""" + timestamps = deque[float](maxlen=10) + last_warning_timestamp = 0.0 + + while app.running(): + timestamp = time.time() + timestamps.append(timestamp) + heartbeat_delayed = _is_heartbeat_delayed(timestamps, configs.heartbeat_time * 1.05) + + # Prevent warning messages from being sent too frequently + can_warn = timestamp - last_warning_timestamp > 10 + if can_warn and heartbeat_delayed: + _logger.warning( + "High average heartbeat interval. " + "Blocking operations are preventing tasks from executing" + ) + last_warning_timestamp = timestamp + + await app.sleep(configs.heartbeat_time) diff --git a/tests/components/heartbeat/test_heartbeat.py b/tests/components/heartbeat/test_heartbeat.py new file mode 100644 index 00000000..e3d1d10c --- /dev/null +++ b/tests/components/heartbeat/test_heartbeat.py @@ -0,0 +1,63 @@ +import asyncio +import time +from collections import deque +from unittest.mock import MagicMock + +import pytest + +import components.heartbeat.heartbeat as heartbeat +import utils.app as app +from configs import configs + +pytestmark = pytest.mark.asyncio(loop_scope="session") + + +@pytest.mark.parametrize( + "timestamps, threshold, expected_result", + [ + (deque([100]), 1, False), + (deque([100, 101]), 1, False), + (deque([100, 102]), 1, True), + (deque([1, 3, 5, 7, 9]), 2, False), + (deque([1, 3, 5, 7, 10]), 2, True), + ], +) +async def test_is_heartbeat_delayed(timestamps, threshold, expected_result): + """'is_heartbeat_delayed' should return True when average latency exceeds threshold""" + assert heartbeat._is_heartbeat_delayed(timestamps, threshold) is expected_result + + +async def test_run(mocker, monkeypatch): + """'run' should append the current timestamp while app is running""" + monkeypatch.setattr(configs, "heartbeat_time", 0.05) + heartbeat_logger_warning_spy: MagicMock = 
mocker.spy(heartbeat._logger, "warning") + + task = asyncio.create_task(heartbeat.run()) + + await asyncio.sleep(0.1) + assert heartbeat_logger_warning_spy.call_count == 0 + time.sleep(0.1) + await asyncio.sleep(0) + assert heartbeat_logger_warning_spy.call_count == 1 + + app.stop() + await asyncio.wait_for(task, timeout=0.1) + + +async def test_run_cooldown(mocker, monkeypatch): + """'run' should respect the cooldown period between warnings when heartbeat is delayed""" + monkeypatch.setattr(configs, "heartbeat_time", 0.05) + heartbeat_logger_warning_spy: MagicMock = mocker.spy(heartbeat._logger, "warning") + + task = asyncio.create_task(heartbeat.run()) + + await asyncio.sleep(0.1) + time.sleep(0.1) + await asyncio.sleep(0) + assert heartbeat_logger_warning_spy.call_count == 1 + time.sleep(0.1) + await asyncio.sleep(0) + assert heartbeat_logger_warning_spy.call_count == 1 + + app.stop() + await asyncio.wait_for(task, timeout=0.1) From 30092cb6b4b75c8be423b7ce18933363bed07da0 Mon Sep 17 00:00:00 2001 From: Gabriel Salla Date: Thu, 12 Feb 2026 09:49:44 -0300 Subject: [PATCH 4/4] add heartbeat monitoring task --- src/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.py b/src/main.py index 4aeaa5ef..3db31a6c 100644 --- a/src/main.py +++ b/src/main.py @@ -6,6 +6,7 @@ import components.controller as controller import components.executor as executor +import components.heartbeat as heartbeat import components.http_server as http_server import components.monitors_loader as monitors_loader import components.task_manager as task_manager @@ -86,6 +87,7 @@ async def main() -> None: for mode in operation_modes: task_manager.create_task(modes[mode]()) + task_manager.create_task(heartbeat.run()) task_manager.create_task(monitors_loader.run()) await task_manager.run()