Skip to content

Commit 28d1d10

Browse files
Merge pull request NVIDIA#191 from hexinw-nvidia/socket_mismatch
fix: make launcher socket finding robust in rank monitor server test
2 parents 4dce778 + 4a02378 commit 28d1d10

File tree

1 file changed

+10
-7
lines changed

1 file changed

+10
-7
lines changed

tests/fault_tolerance/unit/test_rank_monitor_server.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -48,17 +48,20 @@ def setUp(self):
4848
)
4949

5050
# Wait for the server to start and create its launcher socket
51-
# The launcher socket path is constructed as: f"{tempfile.gettempdir()}/_ft_launcher{pid}_to_rmon.socket"
51+
# The launcher socket path is constructed as: f"{tempfile.gettempdir()}/_ft_launcher{server_pid}_to_rmon.socket"
52+
server_pid = self.server_process.pid
53+
expected_socket_name = f"_ft_launcher{server_pid}_to_rmon.socket"
54+
self.launcher_socket_path = os.path.join(tempfile.gettempdir(), expected_socket_name)
55+
5256
max_wait = 5 # seconds
5357
start_time = time.time()
5458
while time.time() - start_time < max_wait:
55-
# Try to find the launcher socket file
56-
for file in os.listdir(tempfile.gettempdir()):
57-
if file.startswith("_ft_launcher") and file.endswith("_to_rmon.socket"):
58-
self.launcher_socket_path = os.path.join(tempfile.gettempdir(), file)
59-
return
59+
if os.path.exists(self.launcher_socket_path):
60+
return
6061
time.sleep(0.1)
61-
raise RuntimeError("Could not find launcher socket file after waiting")
62+
raise RuntimeError(
63+
f"Could not find launcher socket file {expected_socket_name} after waiting {max_wait} seconds"
64+
)
6265

6366
def tearDown(self):
6467
# Clean up the server process

0 commit comments

Comments
 (0)