Skip to content

Commit 9a86dd9

Browse files
authored
Resiliency test automation when platform under stress (#11925)
Signed-off-by: Parag Kamble <[email protected]>
1 parent 21bb40b commit 9a86dd9

File tree

10 files changed

+592
-54
lines changed

10 files changed

+592
-54
lines changed

ocs_ci/ocs/ocp.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,9 @@ def exec_oc_cmd(
238238
config.switch_ctx(original_context)
239239
return out
240240

241-
def exec_oc_debug_cmd(self, node, cmd_list, timeout=300, namespace=None):
241+
def exec_oc_debug_cmd(
242+
self, node, cmd_list, timeout=300, namespace=None, use_root=True
243+
):
242244
"""
243245
Function to execute "oc debug" command on OCP node
244246
@@ -258,11 +260,15 @@ def exec_oc_debug_cmd(self, node, cmd_list, timeout=300, namespace=None):
258260
create_cmd_list = copy.deepcopy(cmd_list)
259261
create_cmd_list.append(" ")
260262
err_msg = "CMD FAILED"
263+
if use_root:
264+
root_option = " chroot /host /bin/bash -c "
265+
else:
266+
root_option = " /bin/bash -c "
261267
cmd = f" || echo '{err_msg}';".join(create_cmd_list)
262268
namespace = namespace or config.ENV_DATA["cluster_namespace"]
263269
debug_cmd = (
264270
f"debug nodes/{node} --to-namespace={namespace} "
265-
f' -- chroot /host /bin/bash -c "{cmd}"'
271+
f' -- {root_option} "{cmd}"'
266272
)
267273
out = str(
268274
self.exec_oc_cmd(command=debug_cmd, out_yaml_format=False, timeout=timeout)

ocs_ci/ocs/resources/pod.py

+42
Original file line numberDiff line numberDiff line change
@@ -4119,3 +4119,45 @@ def get_pods_pvcs(pod_objs, namespace=None):
41194119
namespace = namespace or config.ENV_DATA["cluster_namespace"]
41204120
pvc_names = [get_pvc_name(p) for p in pod_objs]
41214121
return get_pvc_objs(pvc_names, namespace)
4122+
4123+
4124+
def delete_pod_by_phase(
    pod_phase,
    namespace=None,
):
    """
    Delete all the pods that are in a specific phase

    Args:
        pod_phase (str): The pod phase to delete (case-insensitive). One of
            'succeeded', 'failed', 'pending' or 'running'
        namespace (str): Name of cluster namespace
            (default: config.ENV_DATA["cluster_namespace"])

    Returns:
        bool: True, if the delete command succeeded. False, otherwise

    Raises:
        ValueError: If 'pod_phase' is not one of the supported phases

    """
    valid_phases = ("Succeeded", "Failed", "Pending", "Running")
    # The field selector needs the capitalized phase name, while callers may
    # pass any letter case.
    phase = pod_phase.capitalize()
    if phase not in valid_phases:
        # Validate before logging or touching the cluster so that a bad
        # argument fails fast.
        raise ValueError(
            f"Invalid pod status '{pod_phase}'. "
            f"Valid options are 'succeeded', 'failed', 'pending' or 'running'"
        )

    # Resolve the default at call time rather than at import time, so the
    # namespace of the currently loaded config is the one that is used.
    namespace = namespace or config.ENV_DATA["cluster_namespace"]

    logger.info(f"Delete all the pods in the status '{pod_phase}'")
    cmd = f"oc delete pod --field-selector=status.phase={phase} -n {namespace}"
    logger.info(cmd)
    try:
        run_cmd(cmd=cmd)
    except CommandFailed as ex:
        logger.warning(
            f"Failed to delete the pods in the status '{pod_phase}' due to the error: {ex}"
        )
        return False

    logger.info(f"All '{pod_phase}' pods deleted successfully.")
    return True

ocs_ci/resiliency/network_faults.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def _get_all_node_network_interfaces(self, interface_types):
8080
iface.strip() for iface in output.splitlines() if iface.strip()
8181
]
8282
interfaces.extend(ifaces)
83-
except CommandFailed as e:
83+
except (CommandFailed, subprocess.TimeoutExpired) as e:
8484
log.error(f"Error retrieving interfaces from node {node.name}: {e}")
8585
continue
8686

@@ -160,7 +160,7 @@ def _apply_fault(self, description, netem_command):
160160
cmd = f"tc qdisc del dev {iface} root"
161161
try:
162162
self.ocp_obj.exec_oc_debug_cmd(node=node.name, cmd_list=[cmd])
163-
except CommandFailed as e:
163+
except (CommandFailed, subprocess.TimeoutExpired) as e:
164164
log.error(
165165
f"Failed to remove fault from {node.name}/{iface}: {e}"
166166
)
@@ -188,7 +188,7 @@ def _remove_faults_all_nodes(self):
188188
try:
189189
self.ocp_obj.exec_oc_debug_cmd(node=node.name, cmd_list=[cmd_del])
190190
log.debug(f"Deleted qdisc on {node.name}/{iface}")
191-
except CommandFailed as e:
191+
except (CommandFailed, subprocess.TimeoutExpired) as e:
192192
log.warning(f"Could not delete qdisc on {node.name}/{iface}: {e}")
193193
continue
194194

@@ -206,7 +206,7 @@ def _remove_faults_all_nodes(self):
206206
log.info(
207207
f"Verified: netem successfully removed from {node.name}/{iface}"
208208
)
209-
except CommandFailed as e:
209+
except (CommandFailed, subprocess.TimeoutExpired) as e:
210210
log.warning(
211211
f"Could not verify qdisc status on {node.name}/{iface}: {e}"
212212
)

ocs_ci/resiliency/node_stats.py

+106-34
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,63 @@ class NodeStats:
1313
"""
1414

1515
@staticmethod
def cpu_stats(node_obj, interval=1, count=2, format="json"):
    """
    Get CPU statistics for a given node using `mpstat`.

    Args:
        node_obj (OCSNode): The node object to fetch stats from.
        interval (int): Interval in seconds between samples. Default 1.
        count (int): Number of samples to take. Default 2.
        format (str): Output format - "json" or "text". Default "json".

    Returns:
        list or None: The "statistics" list parsed from the mpstat JSON
            output (json), the raw output split into lines (text), or None
            when the format is unsupported, the command fails, the output
            is empty or the JSON cannot be parsed.
    """
    # Reject a bad format before announcing a run that will never happen.
    if format not in ("json", "text"):
        log.error(f"Unsupported format '{format}'. Use 'json' or 'text'.")
        return None

    log.info(
        f"Running mpstat on node '{node_obj.name}' with interval={interval}, count={count}, format={format}"
    )

    cmd = f"mpstat {interval} {count}"
    if format == "json":
        cmd += " -o JSON"
    else:
        # NOTE(review): the text output is routed through a temporary file
        # and read back; kept as-is, confirm whether plain stdout would do.
        cmd += " > /tmp/mpstat.txt && cat /tmp/mpstat.txt"
        log.warning("Text format selected. Output will be raw and unparsed.")

    ocp_obj = ocp.OCP(kind="node")
    try:
        cmd_output = ocp_obj.exec_oc_debug_cmd(
            node=node_obj.name, cmd_list=[cmd], use_root=False
        )
    except CommandFailed as e:
        log.error(f"Failed to fetch CPU stats from node '{node_obj.name}': {e}")
        return None

    if not cmd_output:
        log.warning("Empty response received from mpstat command")
        return None

    if format == "text":
        return cmd_output.splitlines()

    try:
        output = json.loads(cmd_output)
    except json.JSONDecodeError as e:
        log.error(f"Failed to parse JSON from mpstat on node '{node_obj.name}': {e}")
        return None

    # "hosts" may be missing or an empty list; fall back to a single empty
    # host entry instead of raising IndexError on hosts[0].
    hosts = output.get("sysstat", {}).get("hosts") or [{}]
    return hosts[0].get("statistics", [])
3773

3874
@staticmethod
3975
def memory_usage_percent(node_obj):
@@ -47,15 +83,20 @@ def memory_usage_percent(node_obj):
4783
float: Used memory percentage.
4884
"""
4985
ocp_obj = ocp.OCP(kind="node")
50-
cmd = f"debug nodes/{node_obj.name} -- cat /proc/meminfo"
86+
cmd = "cat /proc/meminfo"
5187

5288
try:
53-
output = ocp_obj.exec_oc_cmd(command=cmd, out_yaml_format=False)
89+
output = ocp_obj.exec_oc_debug_cmd(
90+
node=node_obj.name, cmd_list=[cmd], use_root=False, timeout=30
91+
)
5492

5593
meminfo = {}
5694
for line in output.splitlines():
57-
key, value = line.strip().split(":", 1)
58-
meminfo[key] = int(value.strip().split()[0]) # in KB
95+
try:
96+
key, value = line.strip().split(":", 1)
97+
meminfo[key] = int(value.strip().split()[0]) # in KB
98+
except (ValueError, IndexError):
99+
continue
59100

60101
mem_total = meminfo.get("MemTotal")
61102
mem_available = meminfo.get("MemAvailable")
@@ -64,57 +105,88 @@ def memory_usage_percent(node_obj):
64105
used_percent = ((mem_total - mem_available) / mem_total) * 100
65106
log.info(f"Memory usage on node {node_obj.name}: {used_percent:.2f}%")
66107
return round(used_percent, 2)
67-
else:
68-
log.warning("Missing MemTotal or MemAvailable in /proc/meminfo")
69-
return 0.0
108+
109+
log.warning("Missing MemTotal or MemAvailable in /proc/meminfo")
110+
return 0.0
70111

71112
except CommandFailed as e:
72-
log.error(f"Failed to compute memory usage on node {node_obj.name}: {e}")
113+
log.error(
114+
f"Failed to compute memory usage on node {node_obj.name}: {str(e)}"
115+
)
73116
return 0.0
74117

75118
@staticmethod
def disk_stats(node_obj, format="json", interval=1, count=2):
    """
    Get disk I/O statistics using `iostat`.

    Args:
        node_obj (OCSNode): Node object to query.
        format (str): Output format ("json" or "text"). Default "json".
        interval (int): Interval in seconds between samples. Default 1.
        count (int): Number of samples to take. Default 2.

    Returns:
        dict or list: The last entry of the "statistics" list parsed from
            the iostat JSON output (json) or the raw output split into
            lines (text). An empty dict is returned when the format is
            unsupported, the command fails, the JSON cannot be parsed or
            no statistics are present.
    """
    if format not in ("json", "text"):
        log.error(f"Unsupported format '{format}'. Use 'json' or 'text'.")
        return {}

    cmd = f"iostat -xt {interval} {count}"
    if format == "json":
        cmd += " -o JSON"

    ocp_obj = ocp.OCP(kind="node")
    try:
        # NOTE(review): the timeout is fixed at 30 while the sampling window
        # grows with interval * count - confirm large windows are not used.
        output = ocp_obj.exec_oc_debug_cmd(
            node=node_obj.name, cmd_list=[cmd], use_root=False, timeout=30
        )
    except CommandFailed as e:
        log.error(f"Failed to fetch disk stats from node '{node_obj.name}': {e}")
        return {}

    if format == "text":
        return output.splitlines()

    try:
        parsed = json.loads(output)
    except json.JSONDecodeError as e:
        log.error(f"Failed to parse JSON from iostat on node '{node_obj.name}': {e}")
        return {}

    # "hosts" may be missing or an empty list; fall back to a single empty
    # host entry instead of raising IndexError on hosts[0].
    hosts = parsed.get("sysstat", {}).get("hosts") or [{}]
    stats = hosts[0].get("statistics", [])
    # Only the most recent sample is of interest.
    return stats[-1] if stats else {}
100166

101167
@staticmethod
def network_stats(node_obj, interface="ovn-k8s-mp0", interval=1, count=2):
    """
    Get network interface statistics using `sar`.

    Args:
        node_obj (OCSNode): Node object to query.
        interface (str): Network interface to monitor. Default "ovn-k8s-mp0".
            NOTE(review): this value is not used when building the `sar`
            command, so statistics for all devices are returned - confirm
            whether filtering was intended.
        interval (int): Interval in seconds between samples. Default 1.
        count (int): Number of samples to take. Default 2.

    Returns:
        list: Network interface statistics as text lines. An empty list is
            returned when the command fails.
    """
    ocp_obj = ocp.OCP(kind="node")
    cmd = f"sar -n DEV {interval} {count}"

    # Announce the run before executing it, so the message is present in the
    # log even when the command fails.
    log.info(f"Running network stats command on node: {node_obj.name}")
    try:
        output = ocp_obj.exec_oc_debug_cmd(
            node=node_obj.name, cmd_list=[cmd], use_root=False
        )
    except CommandFailed as e:
        log.error(f"Failed to fetch network stats for node {node_obj.name}: {e}")
        return []

    return output.splitlines()

ocs_ci/resiliency/platform_failures.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import logging
1919
import random
20+
import subprocess
2021

2122
from ocs_ci.ocs.platform_nodes import PlatformNodesFactory
2223
from ocs_ci.resiliency.network_faults import NetworkFaults
@@ -148,7 +149,7 @@ def run_fault_simulation(nodes, interfaces, label):
148149
nf = NetworkFaults(nodes, interface_types=interfaces)
149150
nf.run()
150151
log.info(f"[{label}] Completed fault simulation.")
151-
except (ValueError, CommandFailed) as e:
152+
except (ValueError, CommandFailed, subprocess.TimeoutExpired) as e:
152153
log.error(f"[{label}] Error during simulation: {e}")
153154

154155
# Helper to get a random subset of any list

0 commit comments

Comments
 (0)