Skip to content

Commit 208ff80

Browse files
committed
Try another perf fix
1 parent aa9e73c commit 208ff80

5 files changed

Lines changed: 302 additions & 40 deletions

File tree

docs/changelog.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
66

77
Each revision is versioned by the date of the revision.
88

9+
## 2026-06-22
10+
11+
- Fix a performance issue in `reconcile` where the hook blocked for ~30 seconds while the
12+
Wazuh API was still starting up. The Wazuh API client now uses a tight retry/backoff and
13+
short timeouts, and treats "API not ready" failures (connection errors, timeouts, and transient
14+
5xx on authentication) as `WazuhNotReadyError` so `reconcile` fails fast to maintenance status instead of
15+
blocking. The charm now also re-runs `reconcile` on `update-status` and on the
16+
`wazuh-server` pebble check recovering, so deferred API configuration completes on a
17+
later event. The Wazuh API calls are now wrapped in OpenTelemetry spans for visibility.
18+
- Fix a performance issue in `reconcile`: `_reconcile_users` no longer retries API
19+
authentication with a one-second backoff on `WazuhAuthenticationError` (HTTP 401).
20+
A 401 is deterministic for a given password, so the retries could not succeed and
21+
only stalled reconcile by up to 5 seconds per user.
22+
923
## 2026-06-18
1024

1125
- Add distributed tracing support via `ops[tracing]`. Relates the charm to a Tempo-compatible

src/charm.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ def __init__(self, *args: typing.Any):
7474

7575
self.framework.observe(self.on.install, self._on_install)
7676
self.framework.observe(self.on.wazuh_server_pebble_ready, self.reconcile)
77+
self.framework.observe(self.on.wazuh_server_pebble_check_recovered, self.reconcile)
7778
self.framework.observe(self.on.config_changed, self.reconcile)
79+
self.framework.observe(self.on.update_status, self.reconcile)
7880
self.framework.observe(self.on[WAZUH_PEER_RELATION_NAME].relation_joined, self.reconcile)
7981
self.framework.observe(self.on[WAZUH_PEER_RELATION_NAME].relation_changed, self.reconcile)
8082
self.framework.observe(self.on[WAZUH_PEER_RELATION_NAME].relation_departed, self.reconcile)
@@ -289,16 +291,17 @@ def _reconcile_users(self) -> None: # noqa: C901
289291
credentials = self.state.api_credentials
290292

291293
for username, user_details in state.WAZUH_USERS.items():
292-
# Try to authenticate with the current credentials. If it fails, password is invalid.
293-
retries = 5
294+
# Try to authenticate with the current credentials. A 401 (WazuhAuthenticationError)
295+
# is deterministic for a given password, so a single attempt is enough; retrying the
296+
# same credentials can never succeed and only stalls reconcile. Transient failures
297+
# (Wazuh not ready) surface as WazuhNotReadyError and propagate to the caller.
294298
valid = False
295-
while credentials[username] and not valid and retries > 0:
299+
if credentials[username]:
296300
try:
297301
wazuh.authenticate_user(username, credentials[username])
298302
valid = True
299303
except wazuh.WazuhAuthenticationError:
300-
retries -= 1
301-
time.sleep(1)
304+
valid = False
302305

303306
# Secret exists but users are the default. Force recreation.
304307
if not valid:
@@ -307,23 +310,15 @@ def _reconcile_users(self) -> None: # noqa: C901
307310
# create user if it doesn't exist yet
308311
current_password = credentials[username]
309312
password_to_save = None
310-
retries = 5
311-
while not current_password and retries > 0:
313+
if not current_password:
312314
try:
313315
token = wazuh.authenticate_user("wazuh", credentials["wazuh"])
314316
new_password = wazuh.generate_api_password()
315317
wazuh.create_api_user(username, new_password, token)
316318
current_password = new_password
317319
password_to_save = new_password
318320
except wazuh.WazuhAuthenticationError as exc:
319-
retries -= 1
320-
logger.error(
321-
"Failed to create user %s: %s. %s retries remaining.",
322-
username,
323-
exc,
324-
retries,
325-
)
326-
time.sleep(1)
321+
logger.error("Failed to create user %s: %s.", username, exc)
327322

328323
# change credentials if they've never been changed
329324
if current_password == user_details["default_password"]:

src/wazuh.py

Lines changed: 81 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from enum import Enum
1414
from pathlib import Path
1515

16+
import opentelemetry.trace
1617
import ops
1718
import requests
1819
import requests.adapters
@@ -28,6 +29,14 @@
2829
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
2930
logging.getLogger("urllib3").setLevel(logging.WARNING)
3031

32+
tracer = opentelemetry.trace.get_tracer(__name__)
33+
34+
# Wazuh API request tuning. Keep retries tight and timeouts short so that, while the
35+
# Wazuh API is still starting up, calls fail fast (and surface as WazuhNotReadyError)
36+
# instead of blocking the reconcile hook for tens of seconds with exponential backoff.
37+
API_REQUEST_TIMEOUT = 5
38+
API_NOT_READY_STATUSES = frozenset({500, 502, 503, 504})
39+
3140
CONTAINER_NAME = "wazuh-server"
3241
INGEST_LOG_DIR = "/var/log/collectors/rsyslog" # logs intended for ingestion
3342
REPOSITORY_PATH = "/root/repository"
@@ -847,6 +856,28 @@ def _generate_cluster_snippet(
847856
"""
848857

849858

859+
def _api_session() -> requests.Session:
860+
"""Build a requests session with a tight retry policy for the Wazuh API.
861+
862+
The retries are intentionally minimal so that calls against a Wazuh API that is
863+
still starting up fail fast instead of blocking the reconcile hook for tens of
864+
seconds. Transient connection issues are surfaced by the callers as
865+
WazuhNotReadyError so the charm waits for a later event rather than erroring.
866+
867+
Returns: a configured requests Session.
868+
"""
869+
session = requests.Session()
870+
retries = requests.adapters.Retry(
871+
total=2,
872+
connect=2,
873+
read=2,
874+
backoff_factor=0.1,
875+
)
876+
session.mount("https://", requests.adapters.HTTPAdapter(max_retries=retries))
877+
return session
878+
879+
880+
@tracer.start_as_current_span("authenticate_user")
850881
def authenticate_user(username: str, password: str) -> str:
851882
"""Authenticate an API user.
852883
@@ -865,30 +896,36 @@ def authenticate_user(username: str, password: str) -> str:
865896
# certificates may be self-signed and there's no value in verifying them
866897
# as a compromised localhost service would indicate we're already compromised
867898
try:
868-
session = requests.Session()
869-
retries = requests.adapters.Retry(connect=10, backoff_factor=0.2, status_forcelist=[500])
870-
session.mount("https://", requests.adapters.HTTPAdapter(max_retries=retries))
871-
response = session.get( # nosec
899+
response = _api_session().get( # nosec
872900
AUTH_ENDPOINT,
873901
auth=(username, password),
874-
timeout=10,
902+
timeout=API_REQUEST_TIMEOUT,
875903
verify=False,
876904
)
877905
if response.status_code == 401:
878906
raise WazuhAuthenticationError(f"The provided password for {username} is not valid.")
907+
if response.status_code in API_NOT_READY_STATUSES:
908+
raise WazuhNotReadyError(
909+
f"Wazuh API not ready (status {response.status_code}) authenticating {username}."
910+
)
879911
response.raise_for_status()
880912
token = response.json()["data"]["token"] if response.json()["data"] else None
881913
if token is None:
882914
raise WazuhInstallationError(f"Response for {username} does not contain token.")
883915
logger.debug("Got Wazuh API auth token for username %s", username)
884916
return token
885-
except requests.exceptions.ConnectionError as exc:
886-
logger.warning("Wazuh API authentication failed: %s", exc)
917+
except (
918+
requests.exceptions.ConnectionError,
919+
requests.exceptions.Timeout,
920+
requests.exceptions.RetryError,
921+
) as exc:
922+
logger.warning("Wazuh API not ready while authenticating: %s", exc)
887923
raise WazuhNotReadyError from exc
888924
except requests.exceptions.RequestException as exc:
889925
raise WazuhInstallationError from exc
890926

891927

928+
@tracer.start_as_current_span("change_api_password")
892929
def change_api_password(username: str, password: str, token: str) -> None:
893930
"""Change Wazuh's API password for a given user.
894931
@@ -899,31 +936,40 @@ def change_api_password(username: str, password: str, token: str) -> None:
899936
900937
Raises:
901938
WazuhInstallationError: if an error occurs while processing the requests.
939+
WazuhNotReadyError: if wazuh is not yet ready to accept requests.
902940
"""
903941
# certificates may be self-signed and there's no value in verifying them
904942
# as a compromised localhost service would indicate we're already compromised
943+
session = _api_session()
905944
try:
906945
headers = {"Authorization": f"Bearer {token}"}
907-
response = requests.get( # nosec
946+
response = session.get( # nosec
908947
f"https://localhost:{API_PORT}/security/users",
909948
headers=headers,
910-
timeout=10,
911-
verify=False, # nosec # noqa: S501
949+
timeout=API_REQUEST_TIMEOUT,
950+
verify=False, # nosec
912951
)
913952
response.raise_for_status()
914953
data = response.json()["data"]
915954
user_id = next(
916955
user["id"] for user in data["affected_items"] if data and user["username"] == username
917956
)
918-
response = requests.put( # nosec
957+
response = session.put( # nosec
919958
f"https://localhost:{API_PORT}/security/users/{user_id}",
920959
headers=headers,
921960
json={"password": password},
922-
timeout=10,
923-
verify=False, # nosec # noqa: S501
961+
timeout=API_REQUEST_TIMEOUT,
962+
verify=False, # nosec
924963
)
925964
response.raise_for_status()
926965
logger.info("Changed API password for user %s", username)
966+
except (
967+
requests.exceptions.ConnectionError,
968+
requests.exceptions.Timeout,
969+
requests.exceptions.RetryError,
970+
) as exc:
971+
logger.warning("Wazuh API not ready while changing password: %s", exc)
972+
raise WazuhNotReadyError from exc
927973
except requests.exceptions.RequestException as exc:
928974
raise WazuhInstallationError("Error modifying the default password.") from exc
929975

@@ -946,6 +992,7 @@ def generate_api_password() -> str:
946992
return "".join(password)
947993

948994

995+
@tracer.start_as_current_span("create_api_user")
949996
def create_api_user(username: str, password: str, token: str, rolename: str = "readonly") -> None:
950997
"""Create a new readonly user for Wazuh's API.
951998
@@ -958,55 +1005,64 @@ def create_api_user(username: str, password: str, token: str, rolename: str = "r
9581005
Raises:
9591006
WazuhAuthenticationError: if a 401 error occurs while processing the requests.
9601007
WazuhInstallationError: if any non-401 error occurs while processing the requests.
1008+
WazuhNotReadyError: if wazuh is not yet ready to accept requests.
9611009
"""
9621010
# certificates may be self-signed and there's no value in verifying them
9631011
# as a compromised localhost service would indicate we're already compromised
9641012
response = None
1013+
session = _api_session()
9651014
try:
9661015
headers = {"Authorization": f"Bearer {token}"}
967-
response = requests.get( # nosec
1016+
response = session.get( # nosec
9681017
f"https://localhost:{API_PORT}/security/users",
9691018
headers=headers,
970-
timeout=10,
971-
verify=False, # nosec # noqa: S501
1019+
timeout=API_REQUEST_TIMEOUT,
1020+
verify=False, # nosec
9721021
)
9731022
response.raise_for_status()
9741023
data = response.json()["data"]
9751024
user_id = [
9761025
user["id"] for user in data["affected_items"] if data and user["username"] == username
9771026
]
9781027
if not user_id: # user has not been created yet
979-
response = requests.post( # nosec
1028+
response = session.post( # nosec
9801029
f"https://localhost:{API_PORT}/security/users",
9811030
headers=headers,
9821031
json={"username": username, "password": password},
983-
timeout=10,
984-
verify=False, # nosec # noqa: S501
1032+
timeout=API_REQUEST_TIMEOUT,
1033+
verify=False, # nosec
9851034
)
9861035
response.raise_for_status()
9871036
data = response.json()["data"]
9881037
user_id = next(
9891038
user["id"] for user in data["affected_items"] if data and user["username"] == username
9901039
)
991-
response = requests.get( # nosec
1040+
response = session.get( # nosec
9921041
f"https://localhost:{API_PORT}/security/roles",
9931042
headers=headers,
994-
timeout=10,
995-
verify=False, # nosec # noqa: S501
1043+
timeout=API_REQUEST_TIMEOUT,
1044+
verify=False, # nosec
9961045
)
9971046
response.raise_for_status()
9981047
data = response.json()["data"]
9991048
role_id = next(
10001049
role["id"] for role in data["affected_items"] if data and role["name"] == rolename
10011050
)
1002-
response = requests.post( # nosec
1051+
response = session.post( # nosec
10031052
f"https://localhost:{API_PORT}/security/users/{user_id}/roles?role_ids={role_id}",
10041053
headers=headers,
1005-
timeout=10,
1006-
verify=False, # nosec # noqa: S501
1054+
timeout=API_REQUEST_TIMEOUT,
1055+
verify=False, # nosec
10071056
)
10081057
response.raise_for_status()
10091058
logger.info("Created user %s", username)
1059+
except (
1060+
requests.exceptions.ConnectionError,
1061+
requests.exceptions.Timeout,
1062+
requests.exceptions.RetryError,
1063+
) as exc:
1064+
logger.warning("Wazuh API not ready while creating user: %s", exc)
1065+
raise WazuhNotReadyError from exc
10101066
except requests.exceptions.RequestException as exc:
10111067
if isinstance(response, requests.Response) and response.status_code == 401:
10121068
raise WazuhAuthenticationError("401 error creating an API user") from exc

0 commit comments

Comments
 (0)