Skip to content

Commit 499758b

Browse files
authored
Merge pull request #12072 from petr-balogh/stabilize-rdr-deployment-cherry-pick-4.17
Fix slow ops issue in rdr deployment
2 parents 2cf4342 + 9b79d64 commit 499758b

File tree

2 files changed

+43
-3
lines changed

2 files changed

+43
-3
lines changed

ocs_ci/ocs/resources/storage_cluster.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
)
2525
from ocs_ci.ocs import constants, defaults, ocp, managedservice
2626
from ocs_ci.ocs.exceptions import (
27+
CephHealthRecoveredException,
2728
CommandFailed,
2829
ResourceNotFoundError,
2930
UnsupportedFeatureError,
@@ -705,12 +706,24 @@ def ocs_install_verification(
705706
# https://bugzilla.redhat.com/show_bug.cgi?id=1817727
706707
health_check_tries = 180
707708

709+
rdr_run = config.MULTICLUSTER.get("multicluster_mode") == "regional-dr"
710+
708711
# TODO: Enable the check when a solution is identified for tools pod on FaaS consumer
709712
if not (fusion_aas_consumer or hci_cluster):
710713
# Temporarily disable health check for hci until we have enough healthy clusters
711-
assert utils.ceph_health_check(
712-
namespace, health_check_tries, health_check_delay
713-
)
714+
try:
715+
assert utils.ceph_health_check(
716+
namespace,
717+
health_check_tries,
718+
health_check_delay,
719+
fix_ceph_health=rdr_run,
720+
)
721+
except CephHealthRecoveredException as ex:
722+
if rdr_run and "slow ops" in str(ex):
723+
# Related issue: https://github.com/red-hat-storage/ocs-ci/issues/11244
724+
log.warning("For RDR run we ignore slow ops error as it was recovered!")
725+
else:
726+
raise
714727
# Let's wait for storage system after ceph health is OK to prevent fails on
715728
# Progressing': 'True' state.
716729

ocs_ci/utility/utils.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2504,6 +2504,25 @@ def ceph_health_resolve_daemon_crash():
25042504
archive_ceph_crashes(ct_pod)
25052505

25062506

2507+
def ceph_health_resolve_mon_slow_ops(health_status):
    """
    Try to fix the ceph health issue with mon slow ops.

    The ID of the affected mon is parsed out of the health status text and
    the pods labeled with that ``ceph_daemon_id`` in the cluster namespace
    (``config.ENV_DATA['cluster_namespace']``) are deleted. If no mon ID can
    be found in the status, a warning is logged and nothing is deleted.

    Args:
        health_status (str): Ceph health status text, expected to contain
            "mon.<id> has slow ops" where <id> is a single lowercase letter

    """
    log.warning("Trying to fix the issue with mon slow ops by restarting mon pod")
    match = re.search(r"mon\.([a-z]) has slow ops", health_status)
    if not match:
        # Without a mon ID there is nothing to act on; say so explicitly
        # instead of returning silently after announcing a restart.
        log.warning(
            "No MON ID with slow ops found in ceph health status, "
            "no mon pod will be restarted"
        )
        return

    mon_id = match.group(1)
    log.warning(f"Problematic MON ID with slow ops: {mon_id} will be restarted")

    # Kept as a function-local import like in the original code
    # (presumably to avoid a circular import - TODO confirm).
    from ocs_ci.ocs import ocp

    # NOTE(review): the command string itself starts with "oc" - confirm that
    # exec_oc_cmd does not prepend "oc" on its own.
    # NOTE(review): the selector uses only ceph_daemon_id - confirm that no
    # other daemon type carries the same single-letter ID in this namespace.
    ocp.OCP().exec_oc_cmd(
        f"oc delete pod -n {config.ENV_DATA['cluster_namespace']} -l ceph_daemon_id={mon_id}"
    )
2524+
2525+
25072526
def ceph_health_recover(health_status, namespace=None):
25082527
"""
25092528
Function which tries to recover ceph health to be HEALTH OK
@@ -2526,6 +2545,14 @@ def ceph_health_recover(health_status, namespace=None):
25262545
"ceph_health_tries": 5,
25272546
"ceph_health_delay": 30,
25282547
},
2548+
{
2549+
"pattern": r"slow ops, oldest one blocked for \d+ sec, mon\.([a-z]) has slow ops",
2550+
"func": ceph_health_resolve_mon_slow_ops,
2551+
"func_args": [health_status],
2552+
"func_kwargs": {},
2553+
"ceph_health_tries": 6,
2554+
"ceph_health_delay": 30,
2555+
},
25292556
# TODO: Add more patterns and fix functions
25302557
]
25312558
for fix_dict in ceph_health_fixes:

0 commit comments

Comments
 (0)