Skip to content

Commit 11f5c07

Browse files
akram09akram09
andauthored
[DPE-7002] Update opensearch health checks (#215)
## Description of issue or feature: This PR addresses the issue mentioned in #177. After initial discussion, it was decided that the opensearch-dashboard charm should stay in a blocked status if the opensearch charm is not healthy or unreachable. This is due to the tight coupling between the two charms. The user is therefore asked to make interventions if the opensearch charm is not in a healthy state. At the current state, we don't make difference between opensearch being sel-healed or restarting and the case of opensearch not working correctly. Therefore this PR makes improvement over the opensearch health checks. ## Solution: - Remove the `app_healthy` function and make calls directly to opensearch_ok to avoid confusions. - Add tenacity to the health checks. - Log the exception happening when json decode error occurs. - Update charm status message when opensearch is in a yellow state. ## How was this change tested? - [X] Manually - [X] Unit tests - [X] Integration tests --------- Co-authored-by: akram09 <akram.boutouchent@eurecom.fr>
1 parent a7dc967 commit 11f5c07

6 files changed

Lines changed: 71 additions & 25 deletions

File tree

src/charm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,9 @@ def reconcile(self, event: EventBase) -> None:
210210

211211
# Regular health-check
212212
# Checks that may modify the 'app' state as well
213-
app_healthy, app_msg = self.health_manager.app_healthy()
214-
if not app_healthy:
215-
set_global_status(self, BlockedStatus(app_msg))
213+
opensearch_healthy, opensearch_msg = self.health_manager.opensearch_ok()
214+
if not opensearch_healthy:
215+
set_global_status(self, BlockedStatus(opensearch_msg))
216216
return
217217
else:
218218
outdated_status += MSG_APP_STATUS

src/literals.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
MSG_WAITING_FOR_PEER = "waiting for peer relation"
5353
MSG_STATUS_DB_MISSING = "Opensearch connection is missing"
5454
MSG_STATUS_DB_DOWN = "Opensearch service is (partially or fully) down"
55+
MSG_STATUS_DB_UNHEALTHY = "The OpenSearch service health is red"
5556
MSG_TLS_CONFIG = "Waiting for TLS to be fully configured..."
5657
MSG_INCOMPATIBLE_UPGRADE = "Incompatible Opensearch and Dashboards versions"
5758

@@ -65,6 +66,7 @@
6566

6667
MSG_APP_STATUS = [
6768
MSG_STATUS_DB_DOWN,
69+
MSG_STATUS_DB_UNHEALTHY,
6870
]
6971

7072
MSG_UNIT_STATUS = [

src/managers/health.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
import requests
1111
from requests.exceptions import ConnectionError, HTTPError
12+
from tenacity import Retrying, stop_after_attempt, wait_fixed, RetryCallState
13+
import urllib3
1214

1315
from core.cluster import SUBSTRATES, ClusterState
1416
from core.workload import WorkloadBase
@@ -18,6 +20,7 @@
1820
MSG_STATUS_APP_REMOVED,
1921
MSG_STATUS_DB_DOWN,
2022
MSG_STATUS_DB_MISSING,
23+
MSG_STATUS_DB_UNHEALTHY,
2124
MSG_STATUS_ERROR,
2225
MSG_STATUS_HANGING,
2326
MSG_STATUS_UNAVAIL,
@@ -78,41 +81,59 @@ def opensearch_ok(self) -> tuple[bool, str]:
7881
):
7982
return False, MSG_STATUS_DB_MISSING
8083

84+
def log_retry(retry_state: RetryCallState) -> None:
85+
"""Log retry attempts."""
86+
logger.debug(
87+
f"Retrying... Attempt {retry_state.attempt_number}"
88+
f"\tException: {retry_state.outcome.exception()}"
89+
)
90+
8191
for endpoint in self.state.opensearch_server.endpoints:
8292
full_url = f"https://{endpoint}/{HEALTH_OPENSEARCH_STATUS_URL}"
8393

8494
request_kwargs = {
8595
"url": full_url,
8696
"method": "GET",
8797
"verify": self.workload.paths.opensearch_ca,
88-
"headers": None,
98+
"headers": {
99+
"Content-Type": "application/json",
100+
"Accept": "application/json",
101+
},
89102
"timeout": REQUEST_TIMEOUT,
90103
}
91104

92-
try:
93-
with requests.Session() as s:
105+
with requests.Session() as s:
106+
try:
94107
s.auth = ( # type: ignore [reportAttributeAccessIssue]
95108
self.state.opensearch_server.username,
96109
self.state.opensearch_server.password,
97110
)
98-
resp = s.request(**request_kwargs)
99-
resp.raise_for_status()
100-
except requests.exceptions.RequestException:
101-
continue
111+
for attempt in Retrying(
112+
stop=stop_after_attempt(3),
113+
wait=wait_fixed(1),
114+
reraise=True,
115+
before_sleep=log_retry,
116+
):
117+
with attempt:
118+
resp = s.request(**request_kwargs)
119+
resp.raise_for_status()
120+
except (requests.RequestException, urllib3.exceptions.HTTPError):
121+
logger.error(f"Failed to connect to {full_url}")
122+
continue
102123

103124
if resp.status_code == 200:
104125
try:
105126
status = resp.json()
106127
except requests.exceptions.JSONDecodeError:
128+
logger.error(f"Failed to decode JSON from {full_url}")
107129
continue
108-
if status.get("status") == "green":
109-
return True, ""
110130

111-
return False, MSG_STATUS_DB_DOWN
131+
if status.get("status") == "red":
132+
return False, MSG_STATUS_DB_UNHEALTHY
112133

113-
def app_healthy(self) -> tuple[bool, str]:
114-
"""Unit-level global healthcheck."""
115-
return self.opensearch_ok()
134+
if status.get("status") in {"green", "yellow"}:
135+
return True, ""
136+
return False, MSG_STATUS_DB_DOWN
116137

117138
def unit_healthy(self) -> tuple[bool, str]:
118139
"""Unit-level global healthcheck."""

tests/integration/application-charm/src/charm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(self, *args):
3535
self.framework.observe(self.on.update_status, self._on_update_status)
3636

3737
# `albums` index is used in integration test
38-
self.opensearch = OpenSearchRequires(self, "opensearch-client", "albums", "")
38+
self.opensearch = OpenSearchRequires(self, "opensearch-client", "albums", "admin")
3939

4040
self.framework.observe(self.opensearch.on.index_created, self._on_authentication_updated)
4141
self.framework.observe(

tests/integration/test_charm.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import yaml
1212
from pytest_operator.plugin import OpsTest
1313

14+
from literals import MSG_STATUS_DB_UNHEALTHY
15+
1416
from .helpers import (
1517
CONFIG_OPTS,
1618
DASHBOARD_QUERY_PARAMS,
@@ -316,20 +318,37 @@ async def test_dashboard_status_changes(ops_test: OpsTest):
316318
opensearch_relation = get_relations(ops_test, OPENSEARCH_RELATION_NAME)[0]
317319
assert await access_all_dashboards(ops_test, opensearch_relation.id, https=True)
318320

319-
logger.info("Removing an opensearch unit so Opensearch gets in a 'red' state")
320-
await ops_test.model.applications[APP_NAME].destroy_unit(
321-
ops_test.model.applications[OPENSEARCH_APP_NAME].units[1].name
321+
logger.info(
322+
"Adding a new index with shards allocated to a non existent node to make the cluster health red"
322323
)
323-
await ops_test.model.applications[APP_NAME].destroy_unit(
324-
ops_test.model.applications[OPENSEARCH_APP_NAME].units[0].name
324+
client_relation = get_relations(ops_test, OPENSEARCH_RELATION_NAME, DB_CLIENT_APP_NAME)[0]
325+
326+
payload = {
327+
"settings": {
328+
"index.routing.allocation.require._name": "non_existent_node",
329+
"index.number_of_shards": 5,
330+
"index.number_of_replicas": 0,
331+
}
332+
}
333+
334+
payload = json.dumps(payload)
335+
336+
unit_name = ops_test.model.applications[DB_CLIENT_APP_NAME].units[0].name
337+
await client_run_db_request(
338+
ops_test,
339+
unit_name,
340+
client_relation,
341+
"PUT",
342+
"/bad_index",
343+
re.escape(payload),
325344
)
326345
async with ops_test.fast_forward("30s"):
327346
await ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked")
328347

329348
assert await check_full_status(
330349
ops_test,
331350
status="blocked",
332-
status_msg="Opensearch service is (partially or fully) down",
351+
status_msg=MSG_STATUS_DB_UNHEALTHY,
333352
)
334353

335354

tests/unit/test_health.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
OPENSEARCH_REL_NAME,
2121
SUBSTRATE,
2222
)
23-
from src.literals import MSG_STATUS_DB_DOWN, MSG_STATUS_HANGING
23+
from src.literals import MSG_STATUS_DB_DOWN, MSG_STATUS_HANGING, MSG_STATUS_DB_UNHEALTHY
2424
from tests.unit.test_charm import MSG_STATUS_UNHEALTHY
2525

2626
logger = logging.getLogger(__name__)
@@ -192,7 +192,11 @@ def test_health_opensearch_not_ok(harness, status):
192192
)
193193

194194
with patch("os.path.exists", return_value=True), patch("os.path.getsize", return_value=1):
195-
assert (False, MSG_STATUS_DB_DOWN) == harness.charm.health_manager.opensearch_ok()
195+
# We should make a distinction between unhealthy and down
196+
if status == "red":
197+
assert (False, MSG_STATUS_DB_UNHEALTHY) == harness.charm.health_manager.opensearch_ok()
198+
else:
199+
assert (True, "") == harness.charm.health_manager.opensearch_ok()
196200

197201

198202
@responses.activate

0 commit comments

Comments
 (0)