Skip to content

Commit f605443

Browse files
authored
Merge pull request #12049 from petr-balogh/stabilize-rdr-deployment
Fix slow ops issue in rdr deployment
2 parents 572bca1 + 62944c8 commit f605443

File tree

2 files changed

+43
-3
lines changed

2 files changed

+43
-3
lines changed

ocs_ci/ocs/resources/storage_cluster.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
)
2525
from ocs_ci.ocs import constants, defaults, ocp, managedservice
2626
from ocs_ci.ocs.exceptions import (
27+
CephHealthRecoveredException,
2728
CommandFailed,
2829
ResourceNotFoundError,
2930
UnsupportedFeatureError,
@@ -728,12 +729,24 @@ def ocs_install_verification(
728729
# https://bugzilla.redhat.com/show_bug.cgi?id=1817727
729730
health_check_tries = 180
730731

732+
rdr_run = config.MULTICLUSTER.get("multicluster_mode") == "regional-dr"
733+
731734
# TODO: Enable the check when a solution is identified for tools pod on FaaS consumer
732735
if not (fusion_aas_consumer or hci_cluster):
733736
# Temporarily disable health check for hci until we have enough healthy clusters
734-
assert utils.ceph_health_check(
735-
namespace, health_check_tries, health_check_delay
736-
)
737+
try:
738+
assert utils.ceph_health_check(
739+
namespace,
740+
health_check_tries,
741+
health_check_delay,
742+
fix_ceph_health=rdr_run,
743+
)
744+
except CephHealthRecoveredException as ex:
745+
if rdr_run and "slow ops" in str(ex):
746+
# Related issue: https://github.com/red-hat-storage/ocs-ci/issues/11244
747+
log.warning("For RDR run we ignore slow ops error as it was recovered!")
748+
else:
749+
raise
737750
# Let's wait for storage system after ceph health is OK to prevent fails on
738751
# Progressing': 'True' state.
739752

ocs_ci/utility/utils.py

+27
Original file line numberDiff line numberDiff line change
@@ -2558,6 +2558,25 @@ def ceph_health_resolve_crash():
25582558
archive_ceph_crashes(ct_pod)
25592559

25602560

2561+
def ceph_health_resolve_mon_slow_ops(health_status):
    """
    Fix ceph health issue with mon slow ops by restarting the affected mon pod
    (the operator re-creates the pod, which clears the stuck slow ops).

    Args:
        health_status (str): ceph health status message, expected to contain
            a fragment like "mon.c has slow ops" when this fix applies

    """
    log.warning("Trying to fix the issue with mon slow ops by restarting mon pod")
    # Mon daemon IDs are lowercase letters; use [a-z]+ (not a single [a-z])
    # so multi-letter mon IDs are also matched.
    mon_pattern = r"mon\.([a-z]+) has slow ops"
    match = re.search(mon_pattern, health_status)
    if not match:
        # Nothing actionable in the health status — be explicit instead of
        # silently returning, so the caller's retry loop is easier to debug.
        log.warning("No mon with slow ops found in the health status!")
        return
    mon_id = match.group(1)
    log.warning(f"Problematic MON ID with slow ops: {mon_id} will be restarted")
    # Local import to avoid a circular import between utility and ocs modules.
    from ocs_ci.ocs import ocp

    ocp.OCP().exec_oc_cmd(
        f"oc delete pod -n {config.ENV_DATA['cluster_namespace']} -l ceph_daemon_id={mon_id}"
    )
2578+
2579+
25612580
def ceph_health_recover(health_status, namespace=None):
25622581
"""
25632582
Function which tries to recover ceph health to be HEALTH OK
@@ -2588,6 +2607,14 @@ def ceph_health_recover(health_status, namespace=None):
25882607
"ceph_health_tries": 5,
25892608
"ceph_health_delay": 30,
25902609
},
2610+
{
2611+
"pattern": r"slow ops, oldest one blocked for \d+ sec, mon\.([a-z]) has slow ops",
2612+
"func": ceph_health_resolve_mon_slow_ops,
2613+
"func_args": [health_status],
2614+
"func_kwargs": {},
2615+
"ceph_health_tries": 6,
2616+
"ceph_health_delay": 30,
2617+
},
25912618
# TODO: Add more patterns and fix functions
25922619
]
25932620
for fix_dict in ceph_health_fixes:

0 commit comments

Comments
 (0)