Skip to content

Commit f605443

Browse files
authored
Merge pull request #12049 from petr-balogh/stabilize-rdr-deployment
Fix slow ops issue in rdr deployment
2 parents 572bca1 + 62944c8 commit f605443

File tree

2 files changed

+43
-3
lines changed

2 files changed

+43
-3
lines changed

ocs_ci/ocs/resources/storage_cluster.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
)
2525
from ocs_ci.ocs import constants, defaults, ocp, managedservice
2626
from ocs_ci.ocs.exceptions import (
27+
CephHealthRecoveredException,
2728
CommandFailed,
2829
ResourceNotFoundError,
2930
UnsupportedFeatureError,
@@ -728,12 +729,24 @@ def ocs_install_verification(
728729
# https://bugzilla.redhat.com/show_bug.cgi?id=1817727
729730
health_check_tries = 180
730731

732+
rdr_run = config.MULTICLUSTER.get("multicluster_mode") == "regional-dr"
733+
731734
# TODO: Enable the check when a solution is identified for tools pod on FaaS consumer
732735
if not (fusion_aas_consumer or hci_cluster):
733736
# Temporarily disable health check for hci until we have enough healthy clusters
734-
assert utils.ceph_health_check(
735-
namespace, health_check_tries, health_check_delay
736-
)
737+
try:
738+
assert utils.ceph_health_check(
739+
namespace,
740+
health_check_tries,
741+
health_check_delay,
742+
fix_ceph_health=rdr_run,
743+
)
744+
except CephHealthRecoveredException as ex:
745+
if rdr_run and "slow ops" in str(ex):
746+
# Related issue: https://github.com/red-hat-storage/ocs-ci/issues/11244
747+
log.warning("For RDR run we ignore slow ops error as it was recovered!")
748+
else:
749+
raise
737750
# Let's wait for storage system after ceph health is OK to prevent fails on
738751
# Progressing': 'True' state.
739752

ocs_ci/utility/utils.py

+27
Original file line numberDiff line numberDiff line change
@@ -2558,6 +2558,25 @@ def ceph_health_resolve_crash():
25582558
archive_ceph_crashes(ct_pod)
25592559

25602560

2561+
def ceph_health_resolve_mon_slow_ops(health_status):
    """
    Fix ceph health issue with mon slow ops by restarting the affected mon pod
    (the operator re-creates the pod, which clears the stuck slow ops).

    Args:
        health_status (str): ceph health status message, expected to contain
            a fragment like "mon.c has slow ops" when this fix applies

    """
    log.warning("Trying to fix the issue with mon slow ops by restarting mon pod")
    # Mon daemon IDs are lowercase letters; use [a-z]+ (not a single [a-z])
    # so multi-letter mon IDs are also matched.
    mon_pattern = r"mon\.([a-z]+) has slow ops"
    match = re.search(mon_pattern, health_status)
    if not match:
        # Nothing actionable in the health status — be explicit instead of
        # silently returning, so the caller's retry loop is easier to debug.
        log.warning("No mon with slow ops found in the health status!")
        return
    mon_id = match.group(1)
    log.warning(f"Problematic MON ID with slow ops: {mon_id} will be restarted")
    # Local import to avoid a circular import between utility and ocs modules.
    from ocs_ci.ocs import ocp

    ocp.OCP().exec_oc_cmd(
        f"oc delete pod -n {config.ENV_DATA['cluster_namespace']} -l ceph_daemon_id={mon_id}"
    )
2578+
2579+
25612580
def ceph_health_recover(health_status, namespace=None):
25622581
"""
25632582
Function which tries to recover ceph health to be HEALTH OK
@@ -2588,6 +2607,14 @@ def ceph_health_recover(health_status, namespace=None):
25882607
"ceph_health_tries": 5,
25892608
"ceph_health_delay": 30,
25902609
},
2610+
{
2611+
"pattern": r"slow ops, oldest one blocked for \d+ sec, mon\.([a-z]) has slow ops",
2612+
"func": ceph_health_resolve_mon_slow_ops,
2613+
"func_args": [health_status],
2614+
"func_kwargs": {},
2615+
"ceph_health_tries": 6,
2616+
"ceph_health_delay": 30,
2617+
},
25912618
# TODO: Add more patterns and fix functions
25922619
]
25932620
for fix_dict in ceph_health_fixes:

0 commit comments

Comments
 (0)