Skip to content

Chassisd does not wait for the execution to complete for previous admin state change requests #3845

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 61 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
df597bd
Fixed issue #22138
rameshraghupathy Apr 11, 2025
1ac4199
Resolved SA errors
rameshraghupathy Apr 11, 2025
c111cfc
Adding coverage
rameshraghupathy Apr 14, 2025
0df04c8
Adding coverage
rameshraghupathy Apr 14, 2025
f42edea
Adding coverage
rameshraghupathy Apr 14, 2025
2b5b87f
Adding coverage
rameshraghupathy Apr 14, 2025
051753c
Adding coverage
rameshraghupathy Apr 14, 2025
32fad32
Adding coverage
rameshraghupathy Apr 14, 2025
fc9338c
Adding coverage
rameshraghupathy Apr 15, 2025
7ef8186
Adding coverage
rameshraghupathy Apr 15, 2025
ed35477
Adding coverage
rameshraghupathy Apr 15, 2025
7ee216d
Adding coverage
rameshraghupathy Apr 15, 2025
5a27245
Adding coverage
rameshraghupathy Apr 15, 2025
1d96263
Adding coverage
rameshraghupathy Apr 15, 2025
0245039
Adding coverage
rameshraghupathy Apr 15, 2025
628bdeb
Adding coverage
rameshraghupathy Apr 15, 2025
bb1f45c
Adding coverage
rameshraghupathy Apr 15, 2025
8a03b9c
Adding coverage
rameshraghupathy Apr 15, 2025
0bf4d12
Adding coverage
rameshraghupathy Apr 15, 2025
9655381
Adding coverage
rameshraghupathy Apr 15, 2025
9f5d102
Adding coverage
rameshraghupathy Apr 15, 2025
f137859
Adding coverage
rameshraghupathy Apr 15, 2025
be28cdf
Adding coverage
rameshraghupathy Apr 15, 2025
094b751
Adding coverage
rameshraghupathy Apr 15, 2025
86ec6b1
Adding coverage
rameshraghupathy Apr 15, 2025
f20a880
Adding coverage
rameshraghupathy Apr 15, 2025
2e2c79e
Adding coverage
rameshraghupathy Apr 15, 2025
3e8e5d5
Adding coverage
rameshraghupathy Apr 15, 2025
c980649
Adding coverage
rameshraghupathy Apr 15, 2025
a92d5e3
Adding debug log
rameshraghupathy Apr 15, 2025
fb6b52b
Adding debug to test
rameshraghupathy Apr 15, 2025
5651ba8
Adding debug to test
rameshraghupathy Apr 15, 2025
fe3eedf
Adding debug to test
rameshraghupathy Apr 15, 2025
cb0e5d4
Adding debug to test
rameshraghupathy Apr 15, 2025
b7ee989
Adding debug to test
rameshraghupathy Apr 15, 2025
c3cbebe
Adding debug to test
rameshraghupathy Apr 15, 2025
36f8216
Adding debug to test
rameshraghupathy Apr 15, 2025
05d1d86
Adding debug to test
rameshraghupathy Apr 15, 2025
788137c
Adding debug to test
rameshraghupathy Apr 15, 2025
3b6410b
Adding debug to test
rameshraghupathy Apr 15, 2025
4996292
Adding debug to test
rameshraghupathy Apr 15, 2025
82cd145
Adding debug to test
rameshraghupathy Apr 16, 2025
e6a2de5
Adding debug to test
rameshraghupathy Apr 16, 2025
d86eb5b
Adding debug to test
rameshraghupathy Apr 16, 2025
51ccd66
Adding debug to test
rameshraghupathy Apr 16, 2025
e749259
Adding debug to test
rameshraghupathy Apr 16, 2025
b0a1fde
Adding debug to test
rameshraghupathy Apr 16, 2025
d100fae
Adding debug to test
rameshraghupathy Apr 16, 2025
e908c12
Adding debug to test
rameshraghupathy Apr 16, 2025
1dbe136
Adding debug to test
rameshraghupathy Apr 16, 2025
b7a31ba
Adding debug to test
rameshraghupathy Apr 16, 2025
0fd638d
fixing overage
rameshraghupathy Apr 19, 2025
916c585
Addressed a review comments
rameshraghupathy Apr 29, 2025
579a55f
Fixed SA issues
rameshraghupathy Apr 30, 2025
0ba6b90
Fixing UT
rameshraghupathy Apr 30, 2025
250730f
Fixing UT
rameshraghupathy Apr 30, 2025
a2a1cd5
Fixing UT issues
rameshraghupathy May 1, 2025
6988eda
Improving coverage
rameshraghupathy May 1, 2025
27c0b10
Improving coverage
rameshraghupathy May 1, 2025
6790820
Improving coverage
rameshraghupathy May 1, 2025
b981373
Improving coverage
rameshraghupathy May 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 101 additions & 14 deletions config/chassis_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,27 @@
import subprocess
import utilities_common.cli as clicommon
from utilities_common.chassis import is_smartswitch, get_all_dpus
from datetime import datetime, timedelta

TIMEOUT_SECS = 10
TRANSITION_TIMEOUT = timedelta(seconds=240) # 4 minutes


class StateDBHelper:
def __init__(self, sonic_db):
self.db = sonic_db

def get_entry(self, table, key):
"""Fetch all fields from table|key."""
redis_key = f"{table}|{key}"
return self.db.get_all("STATE_DB", redis_key) or {}

def set_entry(self, table, key, entry):
"""Set multiple fields to table|key."""
redis_key = f"{table}|{key}"
for field, value in entry.items():
self.db.set("STATE_DB", redis_key, field, value)

#
# 'chassis_modules' group ('config chassis_modules ...')
#
Expand All @@ -24,6 +41,12 @@ def modules():
pass


def ensure_statedb_connected(db):
if not hasattr(db, 'statedb'):
chassisdb = db.db
chassisdb.connect("STATE_DB")
db.statedb = StateDBHelper(chassisdb)

def get_config_module_state(db, chassis_module_name):
config_db = db.cfgdb
fvs = config_db.get_entry('CHASSIS_MODULE', chassis_module_name)
Expand All @@ -36,6 +59,40 @@ def get_config_module_state(db, chassis_module_name):
return fvs['admin_status']


def get_state_transition_in_progress(db, chassis_module_name):
ensure_statedb_connected(db)
fvs = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
value = fvs.get('state_transition_in_progress', 'False') if fvs else 'False'
return value


def set_state_transition_in_progress(db, chassis_module_name, value):
ensure_statedb_connected(db)
state_db = db.statedb
entry = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name) or {}
entry['state_transition_in_progress'] = value
if value == 'True':
entry['transition_start_time'] = datetime.utcnow().isoformat()
else:
entry.pop('transition_start_time', None)
state_db.set_entry('CHASSIS_MODULE_TABLE', chassis_module_name, entry)


def is_transition_timed_out(db, chassis_module_name):
ensure_statedb_connected(db)
state_db = db.statedb
fvs = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
if not fvs:
return False
start_time_str = fvs.get('transition_start_time')
if not start_time_str:
return False
try:
start_time = datetime.fromisoformat(start_time_str)
except ValueError:
return False
return datetime.utcnow() - start_time > TRANSITION_TIMEOUT

#
# Name: check_config_module_state_with_timeout
# return: True: timeout, False: not timeout
Expand Down Expand Up @@ -116,20 +173,34 @@ def shutdown_chassis_module(db, chassis_module_name):
config_db = db.cfgdb
ctx = click.get_current_context()

if not chassis_module_name.startswith("SUPERVISOR") and \
not chassis_module_name.startswith("LINE-CARD") and \
not chassis_module_name.startswith("FABRIC-CARD") and \
not chassis_module_name.startswith("DPU"):
ctx.fail("'module_name' has to begin with 'SUPERVISOR', 'LINE-CARD', 'FABRIC-CARD', 'DPU'")
if not chassis_module_name.startswith(("SUPERVISOR", "LINE-CARD", "FABRIC-CARD", "DPU")):
ctx.fail("'module_name' has to begin with 'SUPERVISOR', 'LINE-CARD', 'FABRIC-CARD', or 'DPU'")
return

# To avoid duplicate operation
if get_config_module_state(db, chassis_module_name) == 'down':
click.echo("Module {} is already in down state".format(chassis_module_name))
click.echo(f"Module {chassis_module_name} is already in down state")
return

click.echo("Shutting down chassis module {}".format(chassis_module_name))
fvs = {'admin_status': 'down'}
config_db.set_entry('CHASSIS_MODULE', chassis_module_name, fvs)
if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with shutdown.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')

click.echo(f"Shutting down chassis module {chassis_module_name}")
fvs = {
'admin_status': 'down',
}
config_db.set_entry('CHASSIS_MODULE', chassis_module_name, fvs)
else:
click.echo(f"Shutting down chassis module {chassis_module_name}")
config_db.set_entry('CHASSIS_MODULE', chassis_module_name, {'admin_status': 'down'})

if chassis_module_name.startswith("FABRIC-CARD"):
if not check_config_module_state_with_timeout(ctx, db, chassis_module_name, 'down'):
fabric_module_set_admin_status(db, chassis_module_name, 'down')
Expand All @@ -149,16 +220,32 @@ def startup_chassis_module(db, chassis_module_name):
config_db = db.cfgdb
ctx = click.get_current_context()

# To avoid duplicate operation
if not chassis_module_name.startswith(("SUPERVISOR", "LINE-CARD", "FABRIC-CARD", "DPU")):
ctx.fail("'module_name' has to begin with 'SUPERVISOR', 'LINE-CARD', 'FABRIC-CARD', or 'DPU'")
return

if get_config_module_state(db, chassis_module_name) == 'up':
click.echo("Module {} is already set to up state".format(chassis_module_name))
click.echo(f"Module {chassis_module_name} is already set to up state")
return

click.echo("Starting up chassis module {}".format(chassis_module_name))
if is_smartswitch():
fvs = {'admin_status': 'up'}
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with startup.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isnt the transition flag being set in chassisd? Do we need to set here as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell Not necessarily, however any script executing back to back CLI calls may pose issues if I remove this. So, it is safe to keep it in two places. If you are particular and if it works for you I can remove it.


click.echo(f"Starting up chassis module {chassis_module_name}")
fvs = {
'admin_status': 'up',
}
config_db.set_entry('CHASSIS_MODULE', chassis_module_name, fvs)
else:
click.echo(f"Starting up chassis module {chassis_module_name}")
config_db.set_entry('CHASSIS_MODULE', chassis_module_name, None)

if chassis_module_name.startswith("FABRIC-CARD"):
Expand Down
160 changes: 160 additions & 0 deletions tests/chassis_modules_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import sys
import os
from click.testing import CliRunner
from datetime import datetime, timedelta
from config.chassis_modules import (
set_state_transition_in_progress,
is_transition_timed_out,
TRANSITION_TIMEOUT
)

import show.main as show
import config.main as config
Expand Down Expand Up @@ -441,6 +447,160 @@ def test_show_and_verify_system_lags_output_lc4(self):
assert return_code == 0
assert result == show_chassis_system_lags_output_lc4

def test_shutdown_triggers_transition_tracking(self):
with mock.patch("config.chassis_modules.is_smartswitch", return_value=True), \
mock.patch("config.chassis_modules.get_config_module_state", return_value='up'):

runner = CliRunner()
db = Db()
result = runner.invoke(
config.config.commands["chassis"].commands["modules"].commands["shutdown"],
["DPU0"],
obj=db
)
print(result.exit_code)
print(result.output)
assert result.exit_code == 0

# Check CONFIG_DB for admin_status
cfg_fvs = db.cfgdb.get_entry("CHASSIS_MODULE", "DPU0")
admin_status = cfg_fvs.get("admin_status")
print(f"admin_status: {admin_status}")
assert admin_status == "down"

# Check STATE_DB for transition flags
state_fvs = db.db.get_all("STATE_DB", "CHASSIS_MODULE_TABLE|DPU0")
transition_flag = state_fvs.get("state_transition_in_progress")
transition_time = state_fvs.get("transition_start_time")

print(f"state_transition_in_progress: {transition_flag}")
print(f"transition_start_time: {transition_time}")

assert transition_flag == "True"
assert transition_time is not None

def test_shutdown_triggers_transition_in_progress(self):
with mock.patch("config.chassis_modules.is_smartswitch", return_value=True), \
mock.patch("config.chassis_modules.get_config_module_state", return_value='up'):

runner = CliRunner()
db = Db()

fvs = {
'admin_status': 'up',
'state_transition_in_progress': 'True',
'transition_start_time': datetime.utcnow().isoformat()
}
db.cfgdb.set_entry('CHASSIS_MODULE', "DPU0", fvs)

result = runner.invoke(
config.config.commands["chassis"].commands["modules"].commands["shutdown"],
["DPU0"],
obj=db
)
print(result.exit_code)
print(result.output)
assert result.exit_code == 0

fvs = db.db.get_all("STATE_DB", "CHASSIS_MODULE_TABLE|DPU0")
print(f"state_transition_in_progress:{fvs['state_transition_in_progress']}")
print(f"transition_start_time:{fvs['transition_start_time']}")

def test_shutdown_triggers_transition_timeout(self):
with mock.patch("config.chassis_modules.is_smartswitch", return_value=True), \
mock.patch("config.chassis_modules.get_config_module_state", return_value='up'):

runner = CliRunner()
db = Db()

fvs = {
'admin_status': 'up',
'state_transition_in_progress': 'True',
'transition_start_time': (datetime.utcnow() - timedelta(minutes=30)).isoformat()
}
db.cfgdb.set_entry('CHASSIS_MODULE', "DPU0", fvs)

result = runner.invoke(
config.config.commands["chassis"].commands["modules"].commands["shutdown"],
["DPU0"],
obj=db
)
print(result.exit_code)
print(result.output)
assert result.exit_code == 0

fvs = db.db.get_all("STATE_DB", "CHASSIS_MODULE_TABLE|DPU0")
print(f"state_transition_in_progress:{fvs['state_transition_in_progress']}")
print(f"transition_start_time:{fvs['transition_start_time']}")

def test_startup_triggers_transition_tracking(self):
with mock.patch("config.chassis_modules.is_smartswitch", return_value=True), \
mock.patch("config.chassis_modules.get_config_module_state", return_value='down'):

runner = CliRunner()
db = Db()
result = runner.invoke(
config.config.commands["chassis"].commands["modules"].commands["startup"],
["DPU0"],
obj=db
)
print(result.exit_code)
print(result.output)
assert result.exit_code == 0

fvs = db.db.get_all("STATE_DB", "CHASSIS_MODULE_TABLE|DPU0")
print(f"state_transition_in_progress:{fvs['state_transition_in_progress']}")
print(f"transition_start_time:{fvs['transition_start_time']}")

def test_set_state_transition_in_progress_sets_and_removes_timestamp(self):
db = mock.MagicMock()
db.statedb = mock.MagicMock()

# Case 1: Set to 'True' adds timestamp
db.statedb.get_entry.return_value = {}
set_state_transition_in_progress(db, "DPU0", "True")
args = db.statedb.set_entry.call_args[0]
updated_entry = args[2]
assert updated_entry["state_transition_in_progress"] == "True"
assert "transition_start_time" in updated_entry

# Case 2: Set to 'False' removes timestamp
db.statedb.get_entry.return_value = {
"state_transition_in_progress": "True",
"transition_start_time": "2025-05-01T01:00:00"
}
set_state_transition_in_progress(db, "DPU0", "False")
args = db.statedb.set_entry.call_args[0]
updated_entry = args[2]
assert updated_entry["state_transition_in_progress"] == "False"
assert "transition_start_time" not in updated_entry

def test_is_transition_timed_out_all_paths(self):
db = mock.MagicMock()
db.statedb = mock.MagicMock()

# Case 1: No entry
db.statedb.get_entry.return_value = None
assert is_transition_timed_out(db, "DPU0") is False

# Case 2: No transition_start_time
db.statedb.get_entry.return_value = {"state_transition_in_progress": "True"}
assert is_transition_timed_out(db, "DPU0") is False

# Case 3: Invalid format
db.statedb.get_entry.return_value = {"transition_start_time": "not-a-date"}
assert is_transition_timed_out(db, "DPU0") is False

# Case 4: Timed out
old_time = (datetime.utcnow() - TRANSITION_TIMEOUT - timedelta(seconds=1)).isoformat()
db.statedb.get_entry.return_value = {"transition_start_time": old_time}
assert is_transition_timed_out(db, "DPU0") is True

# Case 5: Not timed out yet
now = datetime.utcnow().isoformat()
db.statedb.get_entry.return_value = {"transition_start_time": now}
assert is_transition_timed_out(db, "DPU0") is False

@classmethod
def teardown_class(cls):
print("TEARDOWN")
Expand Down
Loading