Skip to content

Commit d13d900

Browse files
committed
Add ping check logic
Change-Id: I26ac9205f14d7fb053f6ed4da640664dac4621fd
1 parent 090ebb1 commit d13d900

File tree

2 files changed

+386
-0
lines changed

2 files changed

+386
-0
lines changed

app/check_ping.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
import logging
2+
import re
3+
4+
from kubernetes import client
5+
from pydantic import BaseModel, Field
6+
7+
from distributed_runner import DistributedRunner
8+
9+
log = logging.getLogger("check.ping")
10+
11+
default_image = "alpine:3.22"
12+
13+
14+
class CheckPingParameters(BaseModel):
15+
namespace: str = "cluster-health"
16+
image: str = default_image
17+
secondary_network_config: str = Field(None, alias="secondaryNetworkConfig")
18+
timeout_seconds: int = Field(300, alias="timeoutSeconds")
19+
count: int = 1
20+
avg_rtt_ms_threshold: float = Field(None, alias="avgRttMsThreshold")
21+
max_rtt_ms_threshold: float = Field(None, alias="maxRttMsThreshold")
22+
ping_target: str = Field(alias="pingTarget")
23+
24+
25+
class CheckPing:
26+
"""
27+
Runs a network ping test on each node in the cluster.
28+
The check creates a Pod that runs on each node one at a time. Each pod
29+
executes 'ping', and the results are read from the pod logs.
30+
The packet loss and round-trip times are then checked. The Pod is
31+
cleaned up after the check.
32+
"""
33+
34+
def __init__(self, parameters: dict) -> None:
35+
params = CheckPingParameters(**parameters)
36+
self.namespace = params.namespace
37+
self.secondary_network_config = params.secondary_network_config
38+
self.image = params.image
39+
self.count = params.count
40+
self.timeout_seconds = params.timeout_seconds
41+
self.avg_rtt_ms_threshold = params.avg_rtt_ms_threshold
42+
self.max_rtt_ms_threshold = params.max_rtt_ms_threshold
43+
self.ping_target = params.ping_target
44+
self.k8s_core_v1 = client.CoreV1Api()
45+
self.k8s_apps_v1 = client.AppsV1Api()
46+
47+
def parse_pod_logs_from_ping_test_func(self) -> callable:
48+
"""
49+
Creates a parsing function that reads log output from each pod and assesses health check pass/fail.
50+
"""
51+
count = self.count
52+
avg_rtt_ms_threshold = self.avg_rtt_ms_threshold
53+
max_rtt_ms_threshold = self.max_rtt_ms_threshold
54+
55+
def parse_pod_logs_from_ping_test(logs: str, node_name: str) -> bool:
56+
# Check for 0% packet loss
57+
packet_loss_match = re.search(r"(\d+)% packet loss", logs)
58+
if not packet_loss_match:
59+
log.error(
60+
f"Could not parse packet loss from ping results on node {node_name}. Logs:\n{logs}"
61+
)
62+
return False
63+
64+
packet_loss = int(packet_loss_match.group(1))
65+
if packet_loss > 0:
66+
log.error(f"Node {node_name}: {packet_loss}% packet loss detected.")
67+
return False
68+
69+
# Check if the correct number of packets were transmitted and received
70+
transmitted_match = re.search(r"(\d+) packets transmitted", logs)
71+
if not transmitted_match:
72+
log.error(
73+
f"Could not parse transmitted packets on node {node_name}. Logs:\n{logs}"
74+
)
75+
return False
76+
77+
transmitted = int(transmitted_match.group(1))
78+
if transmitted != count:
79+
log.error(
80+
f"Node {node_name}: Expected {count} packets transmitted, but got {transmitted}."
81+
)
82+
return False
83+
84+
# If RTT checks are configured, parse and validate them.
85+
if avg_rtt_ms_threshold is not None or max_rtt_ms_threshold is not None:
86+
# Regex for iputils-ping: rtt min/avg/max/mdev = 0.052/0.052/0.052/0.000 ms
87+
rtt_match = re.search(
88+
r"rtt min/avg/max/mdev = [\d\.]+/([\d\.]+)/([\d\.]+)/[\d\.]+ ms", logs
89+
)
90+
if not rtt_match:
91+
# Fallback for busybox ping: round-trip min/avg/max = 0.052/0.052/0.052 ms
92+
rtt_match = re.search(
93+
r"round-trip min/avg/max = [\d\.]+/([\d\.]+)/([\d\.]+) ms", logs
94+
)
95+
96+
if not rtt_match:
97+
log.error(
98+
f"Could not parse RTT from ping results on node {node_name} but RTT checks were requested. Logs:\n{logs}"
99+
)
100+
return False
101+
102+
rtt_avg = float(rtt_match.group(1))
103+
rtt_max = float(rtt_match.group(2))
104+
log.info(
105+
f"Node {node_name}: Ping RTT avg={rtt_avg:.2f}ms, max={rtt_max:.2f}ms"
106+
)
107+
108+
if avg_rtt_ms_threshold is not None and rtt_avg > avg_rtt_ms_threshold:
109+
log.error(
110+
f"Node {node_name} average RTT {rtt_avg:.2f}ms is above maximum of {avg_rtt_ms_threshold}ms."
111+
)
112+
return False
113+
114+
if max_rtt_ms_threshold is not None and rtt_max > max_rtt_ms_threshold:
115+
log.error(
116+
f"Node {node_name} max RTT {rtt_max:.2f}ms is above maximum of {max_rtt_ms_threshold}ms."
117+
)
118+
return False
119+
120+
log.info(f"Node {node_name}: Ping test successful with 0% packet loss.")
121+
return True
122+
123+
return parse_pod_logs_from_ping_test
124+
125+
def is_healthy(self):
126+
command_args = ["ping", "-c", str(self.count), self.ping_target]
127+
128+
container = client.V1Container(
129+
name="ping-container",
130+
image=self.image,
131+
command=command_args,
132+
)
133+
134+
pod_spec = client.V1PodSpec(
135+
containers=[container],
136+
restart_policy="Never",
137+
termination_grace_period_seconds=5,
138+
)
139+
140+
metadata = client.V1ObjectMeta(
141+
labels={"app": "ping-test"},
142+
)
143+
144+
if self.secondary_network_config:
145+
metadata.annotations = {
146+
"networking.gke.io/interfaces": self.secondary_network_config
147+
}
148+
149+
template = client.V1PodTemplateSpec(
150+
metadata=metadata,
151+
spec=pod_spec,
152+
)
153+
154+
runner = DistributedRunner(
155+
pod_name_prefix="ping",
156+
namespace=self.namespace,
157+
pod_template=template,
158+
timeout_seconds=self.timeout_seconds,
159+
log_parse_function=self.parse_pod_logs_from_ping_test_func(),
160+
)
161+
162+
return runner.run_on_all_nodes()

app/test_check_ping.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
import unittest
2+
from unittest.mock import MagicMock, patch
3+
4+
from check_ping import CheckPing
5+
from pydantic import ValidationError
6+
7+
8+
class TestCheckPing(unittest.TestCase):
9+
def setUp(self):
10+
# Patch the DistributedRunner to avoid actual k8s calls
11+
self.mock_distributed_runner_patch = patch("check_ping.DistributedRunner")
12+
self.MockDistributedRunner = self.mock_distributed_runner_patch.start()
13+
self.mock_runner_instance = MagicMock()
14+
self.MockDistributedRunner.return_value = self.mock_runner_instance
15+
16+
self.valid_params = {
17+
"pingTarget": "8.8.8.8",
18+
}
19+
20+
def tearDown(self):
21+
self.mock_distributed_runner_patch.stop()
22+
23+
def test_init_with_minimal_parameters(self):
24+
"""Tests that CheckPing initializes correctly with minimal valid parameters."""
25+
check = CheckPing(self.valid_params)
26+
self.assertEqual(check.namespace, "cluster-health")
27+
self.assertEqual(check.image, "alpine:3.22")
28+
self.assertEqual(check.count, 1)
29+
self.assertEqual(check.timeout_seconds, 300)
30+
self.assertEqual(check.ping_target, "8.8.8.8")
31+
self.assertIsNone(check.secondary_network_config)
32+
self.assertIsNone(check.avg_rtt_ms_threshold)
33+
self.assertIsNone(check.max_rtt_ms_threshold)
34+
35+
def test_init_with_custom_parameters(self):
36+
"""Tests initialization with custom parameters."""
37+
params = {
38+
"namespace": "custom-ns",
39+
"image": "my-custom/ping:latest",
40+
"secondaryNetworkConfig": "my-net-config",
41+
"timeoutSeconds": 120,
42+
"count": 5,
43+
"avgRttMsThreshold": 50.0,
44+
"maxRttMsThreshold": 100.0,
45+
"pingTarget": "1.1.1.1",
46+
}
47+
check = CheckPing(params)
48+
self.assertEqual(check.namespace, "custom-ns")
49+
self.assertEqual(check.image, "my-custom/ping:latest")
50+
self.assertEqual(check.secondary_network_config, "my-net-config")
51+
self.assertEqual(check.timeout_seconds, 120)
52+
self.assertEqual(check.count, 5)
53+
self.assertEqual(check.avg_rtt_ms_threshold, 50.0)
54+
self.assertEqual(check.max_rtt_ms_threshold, 100.0)
55+
self.assertEqual(check.ping_target, "1.1.1.1")
56+
57+
def test_init_with_missing_required_parameters(self):
58+
"""Tests that initialization fails if required parameters are missing."""
59+
with self.assertRaises(ValidationError):
60+
CheckPing({"namespace": "test"}) # Missing pingTarget
61+
62+
# --- Log Parsing Tests ---
63+
64+
def test_parse_pod_logs_success_no_rtt_check(self):
65+
"""Tests the log parsing function with successful ping results and no RTT checks."""
66+
check = CheckPing(self.valid_params)
67+
log_parser = check.parse_pod_logs_from_ping_test_func()
68+
logs = """
69+
PING 8.8.8.8 (8.8.8.8): 56 data bytes
70+
64 bytes from 8.8.8.8: seq=0 ttl=118 time=10.5 ms
71+
72+
--- 8.8.8.8 ping statistics ---
73+
1 packets transmitted, 1 packets received, 0% packet loss
74+
"""
75+
self.assertTrue(log_parser(logs, "node-1"))
76+
77+
def test_parse_pod_logs_success_with_rtt_check_iputils(self):
78+
"""Tests log parsing with successful RTTs (iputils-ping format)."""
79+
params = self.valid_params.copy()
80+
params["avgRttMsThreshold"] = 20.0
81+
params["maxRttMsThreshold"] = 30.0
82+
check = CheckPing(params)
83+
log_parser = check.parse_pod_logs_from_ping_test_func()
84+
logs = """
85+
PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data.
86+
64 bytes from 8.8.8.8: icmp_seq=1 ttl=118 time=15.1 ms
87+
88+
--- 8.8.8.8 ping statistics ---
89+
1 packets transmitted, 1 received, 0% packet loss, time 0ms
90+
rtt min/avg/max/mdev = 15.100/15.100/15.100/0.000 ms
91+
"""
92+
self.assertTrue(log_parser(logs, "node-1"))
93+
94+
def test_parse_pod_logs_success_with_rtt_check_busybox(self):
95+
"""Tests log parsing with successful RTTs (busybox ping format)."""
96+
params = self.valid_params.copy()
97+
params["avgRttMsThreshold"] = 20.0
98+
params["maxRttMsThreshold"] = 30.0
99+
check = CheckPing(params)
100+
log_parser = check.parse_pod_logs_from_ping_test_func()
101+
logs = """
102+
PING 8.8.8.8 (8.8.8.8): 56 data bytes
103+
64 bytes from 8.8.8.8: seq=0 ttl=118 time=15.1 ms
104+
105+
--- 8.8.8.8 ping statistics ---
106+
1 packets transmitted, 1 packets received, 0% packet loss
107+
round-trip min/avg/max = 15.100/15.100/15.100 ms
108+
"""
109+
self.assertTrue(log_parser(logs, "node-1"))
110+
111+
def test_parse_pod_logs_packet_loss_fail(self):
112+
"""Tests log parsing when packet loss is detected."""
113+
check = CheckPing(self.valid_params)
114+
log_parser = check.parse_pod_logs_from_ping_test_func()
115+
logs = "2 packets transmitted, 1 packets received, 50% packet loss"
116+
self.assertFalse(log_parser(logs, "node-1"))
117+
118+
def test_parse_pod_logs_packet_loss_parse_error(self):
119+
"""Tests log parsing with malformed packet loss data."""
120+
check = CheckPing(self.valid_params)
121+
log_parser = check.parse_pod_logs_from_ping_test_func()
122+
logs = "Some unexpected output without packet loss information."
123+
self.assertFalse(log_parser(logs, "node-1"))
124+
125+
def test_parse_pod_logs_wrong_packet_count(self):
126+
"""Tests log parsing when the transmitted packet count is wrong."""
127+
params = self.valid_params.copy()
128+
params["count"] = 5
129+
check = CheckPing(params)
130+
log_parser = check.parse_pod_logs_from_ping_test_func()
131+
logs = "1 packets transmitted, 1 received, 0% packet loss"
132+
self.assertFalse(log_parser(logs, "node-1"))
133+
134+
def test_parse_pod_logs_avg_rtt_fail(self):
135+
"""Tests log parsing when average RTT is above the threshold."""
136+
params = self.valid_params.copy()
137+
params["avgRttMsThreshold"] = 10.0
138+
check = CheckPing(params)
139+
log_parser = check.parse_pod_logs_from_ping_test_func()
140+
logs = """
141+
1 packets transmitted, 1 received, 0% packet loss
142+
rtt min/avg/max/mdev = 15.0/15.0/15.0/0.0 ms
143+
"""
144+
self.assertFalse(log_parser(logs, "node-1"))
145+
146+
def test_parse_pod_logs_max_rtt_fail(self):
147+
"""Tests log parsing when max RTT is above the threshold."""
148+
params = self.valid_params.copy()
149+
params["maxRttMsThreshold"] = 20.0
150+
check = CheckPing(params)
151+
log_parser = check.parse_pod_logs_from_ping_test_func()
152+
logs = """
153+
1 packets transmitted, 1 received, 0% packet loss
154+
rtt min/avg/max/mdev = 15.0/18.0/25.0/2.0 ms
155+
"""
156+
self.assertFalse(log_parser(logs, "node-1"))
157+
158+
def test_parse_pod_logs_rtt_parse_error(self):
159+
"""Tests log parsing with malformed RTT data when RTT check is enabled."""
160+
params = self.valid_params.copy()
161+
params["avgRttMsThreshold"] = 10.0
162+
check = CheckPing(params)
163+
log_parser = check.parse_pod_logs_from_ping_test_func()
164+
logs = "1 packets transmitted, 1 received, 0% packet loss. No RTT data."
165+
self.assertFalse(log_parser(logs, "node-1"))
166+
167+
# --- is_healthy Tests ---
168+
169+
def test_is_healthy_success(self):
170+
"""Tests is_healthy when the distributed run is successful."""
171+
self.mock_runner_instance.run_on_all_nodes.return_value = True
172+
check = CheckPing(self.valid_params)
173+
self.assertTrue(check.is_healthy())
174+
self.mock_runner_instance.run_on_all_nodes.assert_called_once()
175+
176+
def test_is_healthy_failure(self):
177+
"""Tests is_healthy when the distributed run fails."""
178+
self.mock_runner_instance.run_on_all_nodes.return_value = False
179+
check = CheckPing(self.valid_params)
180+
self.assertFalse(check.is_healthy())
181+
self.mock_runner_instance.run_on_all_nodes.assert_called_once()
182+
183+
def test_is_healthy_constructs_runner_correctly(self):
184+
"""Tests that DistributedRunner is constructed with the correct pod template."""
185+
params = {
186+
"pingTarget": "8.8.8.8",
187+
"count": 5,
188+
"namespace": "ping-ns",
189+
"timeoutSeconds": 99,
190+
}
191+
check = CheckPing(params)
192+
check.is_healthy()
193+
194+
self.MockDistributedRunner.assert_called_once()
195+
_, kwargs = self.MockDistributedRunner.call_args
196+
197+
self.assertEqual(kwargs["pod_name_prefix"], "ping")
198+
self.assertEqual(kwargs["namespace"], "ping-ns")
199+
self.assertEqual(kwargs["timeout_seconds"], 99)
200+
self.assertIsNotNone(kwargs["log_parse_function"])
201+
202+
pod_template = kwargs["pod_template"]
203+
container = pod_template.spec.containers[0]
204+
self.assertEqual(container.image, "alpine:3.22")
205+
self.assertEqual(container.command, ["ping", "-c", "5", "8.8.8.8"])
206+
self.assertIsNone(pod_template.metadata.annotations)
207+
208+
def test_is_healthy_with_secondary_network(self):
209+
"""Tests that the pod template includes secondary network annotations."""
210+
params = self.valid_params.copy()
211+
params["secondaryNetworkConfig"] = '{"name": "net1"}'
212+
check = CheckPing(params)
213+
check.is_healthy()
214+
215+
_, kwargs = self.MockDistributedRunner.call_args
216+
pod_template = kwargs["pod_template"]
217+
self.assertEqual(
218+
pod_template.metadata.annotations,
219+
{"networking.gke.io/interfaces": '{"name": "net1"}'},
220+
)
221+
222+
223+
if __name__ == "__main__":
224+
unittest.main()

0 commit comments

Comments
 (0)