Skip to content

Commit f4349e7

Browse files
[gNOI] Add support for gNOI Factory Reset
1 parent 1ee792e commit f4349e7

File tree

4 files changed

+752
-2
lines changed

4 files changed

+752
-2
lines changed

host_modules/gnoi_reset.py

Lines changed: 351 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
"""gNOI reset module which performs factory reset."""
2+
3+
import json
4+
import logging
5+
import threading
6+
import time
7+
from host_modules import host_service
8+
from host_modules import infra_host
9+
10+
# Module name, used for the DBus bus name and as the prefix of log messages.
MOD_NAME = "gnoi_reset"

# Switches that do not run switch-linux have no boot count, so no boot install
# command is issued for them and the command strings below are left empty.
EXECUTE_BOOT_INSTALL_COMMAND = ""
GET_BOOT_INSTALL_VALUE_COMMAND = ""
EXECUTE_CLEANUP_COMMAND = []

# How long (in seconds) to wait for SONiC Host Service to be killed after the
# reboot command has been executed. A successful reboot kills the service
# within this window, so if this module is still alive once the wait is over
# the reboot is considered to have failed. Budget: 10 containers that can each
# take up to 20 seconds to get killed, plus a buffer of 1 minute.
REBOOT_TIMEOUT = 10 * 20 + 60

EXECUTE_COLD_REBOOT_COMMAND = "sudo reboot"

logger = logging.getLogger(__name__)
30+
31+
32+
class GnoiReset(host_service.HostModule):
    """DBus endpoint that executes the factory reset.

    The endpoint validates a factory reset request, issues the boot install
    command, starts a cold reboot from a background thread and reports the
    outcome as a json-style StartResponse protobuf defined in reset.proto.
    """

    def __init__(self, mod_name):
        # Guards is_reset_ongoing (written by the DBus handler and by the
        # background reboot thread) and reset_response.
        self.lock = threading.Lock()
        self.is_reset_ongoing = False
        self.reset_request = {}
        self.reset_response = {}
        super().__init__(mod_name)

    def _set_reset_ongoing(self, ongoing: bool) -> None:
        """Updates the reset-in-progress flag while holding the lock."""
        with self.lock:
            self.is_reset_ongoing = ongoing

    def populate_reset_response(
        self,
        reset_success=True,
        factory_os_unsupported=False,
        zero_fill_unsupported=False,
        detail="",
    ) -> tuple[int, str]:
        """Populates the factory reset response.

        Args:
          reset_success: Whether the factory reset succeeded. When True the
            remaining arguments are ignored.
          factory_os_unsupported: When True, the failure is reported as
            factory_os_unsupported.
          zero_fill_unsupported: When True (and factory_os_unsupported is
            False), the failure is reported as zero_fill_unsupported.
          detail: The detailed error message stored in the error response. If
            neither of the two flags above is set the failure is reported as
            "other".

        Returns:
          An integer that is 0 if reset_success is True and 1 otherwise, and a
          json-style string of the StartResponse protobuf defined in
          reset.proto.

          Examples of the return value:
            (0, '{"reset_success": {}}')
            (1, '{"reset_error": {"other": true,
                                  "detail": "Previous reset is ongoing."}}')
        """
        if reset_success:
            response = {"reset_success": {}}
        else:
            reset_error = {}
            if factory_os_unsupported:
                reset_error["factory_os_unsupported"] = True
            elif zero_fill_unsupported:
                reset_error["zero_fill_unsupported"] = True
            else:
                reset_error["other"] = True
            reset_error["detail"] = detail
            response = {"reset_error": reset_error}

        # The context manager releases the lock even if serialization raises.
        with self.lock:
            self.reset_response = response
            response_data = json.dumps(response)
        return (0 if reset_success else 1), response_data

    def execute_reboot(self) -> None:
        """Executes the cold reboot and waits for the host service to be killed.

        This is the target of the thread started by _execute_reboot. If the
        reboot command fails, or this module is still alive REBOOT_TIMEOUT
        seconds after the command was issued, the reboot is considered to
        have failed: the failure is logged and the reset-in-progress flag is
        cleared so that a stale flag does not reject every later request.
        """
        rc, stdout, stderr = infra_host.InfraHost._run_command(
            EXECUTE_COLD_REBOOT_COMMAND
        )
        if rc:
            logger.error(
                "%s: Cold reboot failed execution with stdout: %s, stderr: %s.",
                MOD_NAME,
                stdout,
                stderr,
            )
            self._set_reset_ongoing(False)
            return

        time.sleep(REBOOT_TIMEOUT)
        # Still running after the waiting period, so the reboot did not
        # happen.
        logger.error(
            "%s: Cold reboot did not complete within %d seconds.",
            MOD_NAME,
            REBOOT_TIMEOUT,
        )
        self._set_reset_ongoing(False)

    def _check_reboot_in_progress(self) -> tuple[int, str]:
        """Checks whether a previous reset is still in progress.

        A reset counts as in progress when the in-memory flag is set or when
        the boot install value read from the platform is not 0.

        Returns:
          (0, "") if no reset is in progress. Otherwise 1 and a json-style
          string of the StartResponse protobuf defined in reset.proto that
          describes why the request is rejected, for example:
            (1, '{"reset_error": {"other": true,
                                  "detail": "Previous reset is ongoing."}}')
        """
        with self.lock:
            is_reset_ongoing = self.is_reset_ongoing

        rc, stdout, stderr = infra_host.InfraHost._run_command(
            GET_BOOT_INSTALL_VALUE_COMMAND
        )
        if rc or not stdout:
            logger.error(
                "%s: Failed to get boot install value with stdout: %s, stderr: %s",
                MOD_NAME,
                stdout,
                stderr,
            )
            # NOTE(review): clearing the flag here is kept from the original
            # code; confirm it is intended while a reboot thread is waiting.
            self._set_reset_ongoing(False)
            return self.populate_reset_response(
                reset_success=False, detail="Failed to get the boot install value."
            )

        # Example of a valid google-specific platform stdout here is:
        # ["regionselect=a", "bootcount=0", "bootinstall=0"].
        try:
            boot_install = int(stdout[2].split("=")[1])
        except (ValueError, IndexError) as error:
            return self.populate_reset_response(
                reset_success=False,
                detail="Failed to get the boot install value with error: %s."
                % str(error),
            )

        # Return without issuing the reset if the previous reset is ongoing.
        if is_reset_ongoing or boot_install != 0:
            return self.populate_reset_response(
                reset_success=False, detail="Previous reset is ongoing."
            )

        return 0, ""

    def _parse_arguments(self, options) -> tuple[int, str]:
        """Parses and validates the given arguments into a reset request.

        On success the parsed request is stored in self.reset_request;
        otherwise self.reset_request is left empty.

        Args:
          options: A json-style string of StartRequest protobuf defined in
            factory_reset/reset.proto.

        Returns:
          (0, "") if the request is valid. Otherwise 1 and a json-style
          string of the StartResponse protobuf defined in reset.proto that
          describes the error, for example:
            (1, '{"reset_error": {"zero_fill_unsupported": true,
                  "detail": "zero_fill operation is currently unsupported."}}')
        """
        parse_failure = (
            "Failed to parse json formatted factory reset request "
            "into python dict."
        )
        self.reset_request = {}
        # NOTE(review): the DBus in_signature of issue_reset is "as" while
        # options is parsed as a single json string; confirm what the caller
        # sends. json.loads raises TypeError for anything that is not
        # str/bytes/bytearray, so that case is reported as a parse failure
        # instead of escaping as an unhandled exception.
        try:
            request = json.loads(options)
        except (ValueError, TypeError):
            return self.populate_reset_response(
                reset_success=False, detail=parse_failure
            )
        # Valid json that is not an object (e.g. a list or a number) cannot be
        # a StartRequest and would break the key lookups below.
        if not isinstance(request, dict):
            return self.populate_reset_response(
                reset_success=False, detail=parse_failure
            )
        self.reset_request = request

        # Reject the request if zero_fill is set.
        if self.reset_request.get("zeroFill"):
            return self.populate_reset_response(
                reset_success=False,
                zero_fill_unsupported=True,
                detail="zero_fill operation is currently unsupported.",
            )

        # Issue a warning if retain_certs is set.
        if self.reset_request.get("retainCerts"):
            logger.warning("%s: retain_certs is currently ignored.", MOD_NAME)

        return 0, ""

    def _cleanup_images(self) -> None:
        """Cleans up the installed images, preparing for a factory reset."""
        logger.info("Cleaning up install images.")
        # Cleanup all install artifacts.
        for command in EXECUTE_CLEANUP_COMMAND:
            rc, stdout, stderr = infra_host.InfraHost._run_command(command)
            if rc:
                # Cleaning up artifacts is best effort, so continue on failure.
                logger.warning(
                    "%s: Command %s execution failed with stdout: %s, stderr: %s.",
                    MOD_NAME,
                    command,
                    stdout,
                    stderr,
                )

    def _execute_reboot(self) -> tuple[int, str]:
        """Performs a cold reboot, putting the switch into boot install mode.

        Issues the boot install command and then starts execute_reboot in a
        new thread. On either failure the reset-in-progress flag is cleared.
        A RuntimeError raised while starting the thread is caught and
        reported through the returned error response.

        Returns:
          (0, "") if the reboot thread was started. Otherwise 1 and a
          json-style string of the StartResponse protobuf defined in
          reset.proto that describes the error, for example:
            (1, '{"reset_error": {"other": true,
                                  "detail": "Boot count execution failed."}}')
        """
        # Issue the boot install command.
        rc, stdout, stderr = infra_host.InfraHost._run_command(
            EXECUTE_BOOT_INSTALL_COMMAND
        )
        if rc:
            logger.error(
                "%s: Boot count execution failed with stdout: %s, stderr: %s.",
                MOD_NAME,
                stdout,
                stderr,
            )
            self._set_reset_ongoing(False)
            return self.populate_reset_response(
                reset_success=False, detail="Boot count execution failed."
            )

        # Issue the cold reboot in a new thread so that the response can be
        # returned to the caller without waiting for the reboot.
        try:
            reboot_thread = threading.Thread(target=self.execute_reboot)
            reboot_thread.start()
        except RuntimeError as error:
            logger.error(
                "%s: Failed to start thread to execute reboot: %s",
                MOD_NAME,
                error,
            )
            self._set_reset_ongoing(False)
            return self.populate_reset_response(
                reset_success=False,
                detail="Failed to start thread to execute reboot.",
            )

        return 0, ""

    @host_service.method(
        host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is"
    )
    def issue_reset(self, options) -> tuple[int, str]:
        """Issues the factory reset.

        The following steps are performed sequentially:
          1. Parses and validates the request.
          2. Checks that there is no other reset request ongoing.
          3. Cleans up the installed images if factoryOs is requested.
          4. Issues the boot install command.
          5. Issues the cold reboot command to the switch in a new thread.

        Args:
          options: A json-style string of StartRequest protobuf defined in
            factory_reset/reset.proto.

        Returns:
          An integer and a json-style string of the StartResponse protobuf
          defined in reset.proto. The integer is always 0, regardless of
          success or failure, to ensure that the FE consumes the response
          correctly; the outcome is carried by the response string.

          Examples of the return value:
            (0, '{"reset_success": {}}')
            (0, '{"reset_error": {"other": true,
                                  "detail": "Previous reset is ongoing."}}')
        """
        logger.info("%s: Issuing reset from back end.", MOD_NAME)

        # The error code of every step is overridden to 0 so that the FE
        # consumes the response correctly.
        rc, resp = self._parse_arguments(options)
        if rc:
            return 0, resp

        rc, resp = self._check_reboot_in_progress()
        if rc:
            return 0, resp

        # Test and set the flag in one critical section so that two requests
        # cannot both pass the check above and both start a reset.
        with self.lock:
            already_ongoing = self.is_reset_ongoing
            self.is_reset_ongoing = True
        if already_ongoing:
            return 0, self.populate_reset_response(
                reset_success=False, detail="Previous reset is ongoing."
            )[1]

        if self.reset_request.get("factoryOs"):
            self._cleanup_images()

        rc, resp = self._execute_reboot()
        if rc:
            return 0, resp

        return 0, self.populate_reset_response()[1]
347+
348+
349+
def register():
    """Returns the host module class together with its module name."""
    module_class = GnoiReset
    return module_class, MOD_NAME

0 commit comments

Comments
 (0)