Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions tests/topotests/grpc_basic/test_basic_grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,17 @@ def test_capabilities(tgen):
logging.debug("grpc output: %s", output)

modules = sorted(re.findall('name: "([^"]+)"', output))
expected = ["frr-backend", "frr-host", "frr-interface", "frr-logging", "frr-routing", "frr-staticd", "frr-vrf", "ietf-srv6-types", "ietf-syslog-types"]
expected = [
"frr-backend",
"frr-host",
"frr-interface",
"frr-logging",
"frr-routing",
"frr-staticd",
"frr-vrf",
"ietf-srv6-types",
"ietf-syslog-types",
]
assert modules == expected

encodings = sorted(re.findall("supported_encodings: (.*)", output))
Expand Down Expand Up @@ -203,7 +213,7 @@ def test_shutdown_checks(tgen):
time.sleep(1)
try:
for r in tgen.routers().values():
r.net.stopRouter(False)
r.net.stopRouter()
r.net.checkRouterCores()
finally:
if p:
Expand Down
78 changes: 60 additions & 18 deletions tests/topotests/lib/topogen.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,20 @@
import pwd
import re
import shlex
import signal
import subprocess
import sys
import time
from collections import OrderedDict

import lib.topolog as topolog
from lib.micronet import Commander
from lib.micronet_compat import Mininet
from lib.topolog import logger
from munet.base import Timeout
from munet.testing.util import pause_test

from lib import topotest
from . import topotest

CWD = os.path.dirname(os.path.realpath(__file__))

Expand Down Expand Up @@ -466,11 +469,16 @@ def start_router(self, router=None):

def stop_topology(self):
"""
Stops the network topology. This function will call the stop() function
of all gears before calling the mininet stop function, so they can have
their opportunity to do a graceful shutdown. stop() is called twice. The
first is a simple kill with no sleep, the second will sleep if not
killed and try with a different signal.
Stops the network topology. This function will:
1. Send SIGTERM to all daemons on all routers in parallel
2. Wait for all daemons to stop together
3. Force stop any remaining daemons with SIGBUS
4. Stop all non-router gears (hosts, ExaBGP, etc.)
5. Collect errors from all routers
6. Stop the mininet network

This parallel approach reduces test completion time compared to
stopping routers sequentially.
"""
pause = bool(self.net.cfgopt.get_option("--pause-at-end"))
pause = pause or bool(self.net.cfgopt.get_option("--pause"))
Expand All @@ -484,13 +492,43 @@ def stop_topology(self):

logger.info("stopping topology: {}".format(self.modname))

# Step 1: Send SIGTERM to all daemons on all routers in parallel (non-blocking)
routers = {x.net for x in self.gears.values() if isinstance(x, TopoRouter)}
for router in routers:
router.stopRouterSignalDaemons()

# Step 2: Wait for all daemons to stop (check all routers in a single loop)
wait_count = 0
still_running = {x: dt for x in routers if (dt := x.listDaemons())}
for rem in Timeout(30):
if not any(still_running):
break
if wait_count % 5 == 0: # Log every 2.5 seconds
desc = " ".join([f"{r.name}: {d}" for r, d in still_running.items()])
logger.info(f"[%s]: waiting for daemons to stop: %s", rem, desc)
time.sleep(0.5)
wait_count += 1
still_running = {x: dt for x in still_running if (dt := x.listDaemons())}

# Step 3: Force stop any remaining daemons with SIGBUS
for router in still_running:
router.stopRouterSignalDaemons(signal.SIGBUS, remove_pidfile=True)

# Wait 5 seconds for cores
if any(still_running):
time.sleep(5)

# Step 4: Stop all non-routers.
hosts = {x for x in self.gears.values() if not isinstance(x, TopoRouter)}
for host in hosts:
host.stop()

# Step 5: Collect errors from all routers (cores, etc.)
errors = ""
for gear in self.gears.values():
errors += gear.stop()
if len(errors) > 0:
logger.error(
"Errors found post shutdown - details follow: {}".format(errors)
)
for router in routers:
errors += router.checkRouterCores(reportOnce=True)
if errors:
logger.error("Errors found post shutdown: %s", errors)

try:
self.net.stop()
Expand Down Expand Up @@ -615,7 +653,7 @@ def start(self):
"Basic start function that just reports equipment start"
logger.info('starting "{}"'.format(self.name))

def stop(self, wait=True, assertOnError=True):
def stop(self, wait=True):
"Basic stop function that just reports equipment stop"
logger.info('"{}" base stop called'.format(self.name))
return ""
Expand Down Expand Up @@ -952,13 +990,13 @@ def start(self):

return result

def stop(self):
def stop(self, wait=True):
"""
Stop router cleanly:
* Signal daemons twice, once with SIGTERM, then with SIGBUS.
"""
self.logger.debug("stopping (no assert)")
return self.net.stopRouter(False)
self.logger.debug("stopping")
return self.net.stopRouter(wait=wait)

def startDaemons(self, daemons):
"""
Expand Down Expand Up @@ -1175,6 +1213,7 @@ def __str__(self):

class TopoHost(TopoGear):
"Host abstraction."

# pylint: disable=too-few-public-methods

def __init__(self, tgen, name, **params):
Expand Down Expand Up @@ -1213,6 +1252,7 @@ def __str__(self):

class TopoExaBGP(TopoHost):
"ExaBGP peer abstraction."

# pylint: disable=too-few-public-methods

PRIVATE_DIRS = [
Expand Down Expand Up @@ -1281,8 +1321,9 @@ def start(self, peer_dir, env_file=None):

logger.info("{} exabgp started, output={}".format(self.name, output))

def stop(self, wait=True, assertOnError=True):
def stop(self, wait=True):
"Stop ExaBGP peer and kill the daemon"
logger.debug("stopping exabgp")
self.run("kill `cat /var/run/exabgp/exabgp.pid`")
return ""

Expand Down Expand Up @@ -1321,7 +1362,8 @@ def start(self, log_file=None):
stderr=err,
)

def stop(self):
def stop(self, wait=True):
logger.debug("stopping bmp collector")
self.run(f"kill $(cat {self.pid_file}")
return ""

Expand Down
78 changes: 42 additions & 36 deletions tests/topotests/lib/topotest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import functools
import glob
import json
import logging
import os
import platform
import re
Expand All @@ -24,15 +25,14 @@
import sys
import tempfile
import time
import logging
from collections.abc import Mapping
from copy import deepcopy
from pathlib import Path

import lib.topolog as topolog
from lib.micronet_compat import Node
from lib.topolog import logger
from munet.base import commander, get_exec_path_host, Timeout
from munet.base import Timeout, commander, get_exec_path_host
from munet.testing.util import retry

from lib import micronet
Expand Down Expand Up @@ -1611,21 +1611,37 @@ def listDaemons(self):
pass
return ret

def stopRouter(self, assertOnError=True):
# Stop Running FRR Daemons
def stopRouterSignalDaemons(self, sig=signal.SIGTERM, remove_pidfile=False):
stopped = []
running = self.listDaemons()
if not running:
return ""

logger.info("%s: stopping %s", self.name, ", ".join([x[0] for x in running]))
if running and remove_pidfile:
logger.warning(
"%s: force stopping %s", self.name, ", ".join([x[0] for x in running])
)
elif running:
logger.info(
"%s: stopping %s", self.name, ", ".join([x[0] for x in running])
)
for name, pid in running:
logger.debug("{}: sending SIGTERM to {}".format(self.name, name))
logger.debug("%s: sending %s to %s", self.name, signal.strsignal(sig), name)
try:
os.kill(pid, signal.SIGTERM)
# Probably better to use cmd_raises to support non-local process
os.kill(pid, sig)
stopped.append(name)
except OSError as err:
logger.debug(
"%s: could not kill %s (%s): %s", self.name, name, pid, str(err)
)
if remove_pidfile:
self.cmd(f"rm -f /var/run/{self.routertype}/{name}.pid")
return stopped

def stopRouter(self, wait=True):
# Stop Running FRR Daemons
stopped = self.stopRouterSignalDaemons()

if not wait:
return

running = self.listDaemons()
if running:
Expand All @@ -1641,26 +1657,10 @@ def stopRouter(self, assertOnError=True):
break

if running:
logger.warning(
"%s: sending SIGBUS to: %s",
self.name,
", ".join([x[0] for x in running]),
)
for name, pid in running:
pidfile = "/var/run/{}/{}.pid".format(self.routertype, name)
logger.info("%s: killing %s", self.name, name)
self.cmd("kill -SIGBUS %d" % pid)
self.cmd("rm -- " + pidfile)

sleep(
0.5,
"%s: waiting for daemons to exit/core after initial SIGBUS" % self.name,
)
stopped = self.stopRouterSignalDaemons(signal.SIGBUS, True)
sleep(1, f"{self.name}: waiting for core files after SIGBUS")

errors = self.checkRouterCores(reportOnce=True)
if assertOnError and (errors is not None) and len(errors) > 0:
assert "Errors found - details follow:" == 0, errors
return errors
return self.checkRouterCores(reportOnce=True)

def removeIPs(self):
for interface in self.intfNames():
Expand Down Expand Up @@ -1872,20 +1872,23 @@ def startRouter(self, tgen=None):
self.run_in_window("vtysh", title="vt-%s" % self.name)

if self.unified_config:

# Check that none of the datastores are locked before proceeding
def check_datastores_unlocked():
"""Check that all datastores are unlocked"""
try:
logger.info("Checking datastores on router %s", self.name)
logger.debug("Checking datastores on router %s", self.name)
output = self.cmd("vtysh -c 'show mgmt datastore all'")
# Check if any datastore is locked
for line in output.splitlines():
logger.info("Line: %s", line)
logger.debug("Line: %s", line)
if "Locked:" in line and "True" in line:
logger.info("Datastore is locked on router %s", self.name)
logger.warning(
"Datastore is locked on router %s: %s",
self.name,
output,
)
return False
logger.info("Datastores are unlocked on router %s", self.name)
logger.debug("Datastores are unlocked on router %s", self.name)
return True
except Exception:
# If command fails, assume datastores are unlocked
Expand Down Expand Up @@ -1940,15 +1943,18 @@ def startRouterDaemons(self, daemons=None, tgen=None):
# Get global bundle data
if not self.path_exists("/etc/frr/support_bundle_commands.conf"):
logger.info(
"No support bundle commands.conf found in %s namespace, copying them over", self.name
"No support bundle commands.conf found in %s namespace, copying them over",
self.name,
)
# Copy global value if was covered by namespace mount
bundle_data = ""
if os.path.exists("/etc/frr/support_bundle_commands.conf"):
with open("/etc/frr/support_bundle_commands.conf", "r") as rf:
bundle_data = rf.read()
else:
logger.warning("No support bundle commands.conf found, please install them on this system")
logger.warning(
"No support bundle commands.conf found, please install them on this system"
)
self.cmd_raises(
"cat > /etc/frr/support_bundle_commands.conf",
stdin=bundle_data,
Expand Down
Loading