@@ -2558,6 +2558,25 @@ def ceph_health_resolve_crash():
2558
2558
archive_ceph_crashes (ct_pod )
2559
2559
2560
2560
2561
def ceph_health_resolve_mon_slow_ops(health_status):
    """
    Fix ceph health issue with mon slow ops by restarting the affected MON pod.

    Parses the MON daemon ID out of the health status message and deletes the
    matching MON pod so its deployment recreates it, which usually clears the
    stuck slow ops on that monitor.

    Args:
        health_status (str): ceph health output containing the
            "mon.<id> has slow ops" warning

    """
    log.warning("Trying to fix the issue with mon slow ops by restarting mon pod")
    # Rook mon daemon IDs are lowercase letter sequences (a..z, then aa, ab, ...),
    # so match one or more letters, not just a single character.
    mon_pattern = r"mon\.([a-z]+) has slow ops"
    match = re.search(mon_pattern, health_status)
    mon_id = None
    if match:
        mon_id = match.group(1)
        log.warning(f"Problematic MON ID with slow ops: {mon_id} will be restarted")
    if mon_id:
        # Local import to avoid a circular import at module load time.
        from ocs_ci.ocs import ocp

        # Delete by the ceph_daemon_id label so only the affected MON pod is
        # restarted; the operator's deployment recreates it automatically.
        ocp.OCP().exec_oc_cmd(
            f"oc delete pod -n {config.ENV_DATA['cluster_namespace']} -l ceph_daemon_id={mon_id}"
        )
2561
2580
def ceph_health_recover (health_status , namespace = None ):
2562
2581
"""
2563
2582
Function which tries to recover ceph health to be HEALTH OK
@@ -2588,6 +2607,14 @@ def ceph_health_recover(health_status, namespace=None):
2588
2607
"ceph_health_tries" : 5 ,
2589
2608
"ceph_health_delay" : 30 ,
2590
2609
},
2610
+ {
2611
+ "pattern" : r"slow ops, oldest one blocked for \d+ sec, mon\.([a-z]) has slow ops" ,
2612
+ "func" : ceph_health_resolve_mon_slow_ops ,
2613
+ "func_args" : [health_status ],
2614
+ "func_kwargs" : {},
2615
+ "ceph_health_tries" : 6 ,
2616
+ "ceph_health_delay" : 30 ,
2617
+ },
2591
2618
# TODO: Add more patterns and fix functions
2592
2619
]
2593
2620
for fix_dict in ceph_health_fixes :
0 commit comments