Skip to content

Commit 6ee9840

Browse files
authored
fix(exporters): socket name too long for long hostnames (#921)
Signed-off-by: Tomasz Grzegorzek <tgrzegorzek@nvidia.com>
1 parent aabafa0 commit 6ee9840

2 files changed

Lines changed: 209 additions & 10 deletions

File tree

  • packages/nemo-evaluator-launcher

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/exporters/utils.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#
1616
"""Shared utilities for metrics and configuration handling."""
1717

18+
import hashlib
1819
import re
1920
import subprocess
2021
from dataclasses import dataclass, field
@@ -256,6 +257,7 @@ def load_benchmark_info(artifacts_dir: Path) -> Tuple[str, str]:
256257

257258
# SSH connections directory
258259
CONNECTIONS_DIR = Path.home() / ".nemo-evaluator" / "connections"
260+
UNIX_SOCKET_MAX_PATH_LEN = 100 # conservative limit to avoid "socket path too long" errors on Linux (108 chars max, but we need to leave room for the and .sock suffix)
259261

260262

261263
def ssh_setup_masters(remotes: List[Tuple[str, str]]) -> Dict[Tuple[str, str], str]:
@@ -271,7 +273,18 @@ def ssh_setup_masters(remotes: List[Tuple[str, str]]) -> Dict[Tuple[str, str], s
271273
CONNECTIONS_DIR.mkdir(parents=True, exist_ok=True)
272274
control_paths: Dict[Tuple[str, str], str] = {}
273275
for username, hostname in remotes:
274-
socket_path = str(CONNECTIONS_DIR / f"{username}_{hostname}.sock")
276+
m = hashlib.md5()
277+
m.update(f"{username}@{hostname}".encode("utf-8"))
278+
socket_length = (
279+
UNIX_SOCKET_MAX_PATH_LEN - len(str(CONNECTIONS_DIR)) - 16
280+
) # for multiplexing
281+
if socket_length <= 16:
282+
# managing collision risk
283+
raise RuntimeError(
284+
f"Base connections directory path is too long for SSH control socket: {CONNECTIONS_DIR}"
285+
)
286+
socket_name = str(int(m.hexdigest(), 16))[:socket_length]
287+
socket_path = str(CONNECTIONS_DIR / f"{socket_name}.sock")
275288
socket_or_none = open_master_connection(
276289
username=username,
277290
hostname=hostname,

packages/nemo-evaluator-launcher/tests/unit_tests/exporters/test_utils.py

Lines changed: 195 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -343,19 +343,205 @@ def test_extract_with_missing_artifacts_dir(self, tmp_path: Path):
343343

344344

345345
class TestSSHHelpers:
346-
def test_setup_and_cleanup_masters(self):
347-
remotes = [("user", "host"), ("user", "host"), ("other", "otherhost")]
346+
def test_ssh_socket_path_generation(self):
347+
"""Test that SSH socket paths are generated correctly with MD5 hashing."""
348+
remotes = [("user", "hostname")]
348349

349350
with patch(
350-
"subprocess.run", return_value=SimpleNamespace(returncode=0)
351-
) as mock_run:
351+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
352+
return_value="/path/to/socket.sock",
353+
) as mock_open:
352354
cp = U.ssh_setup_masters(remotes)
353-
assert len(cp) == 2 # Two unique remote combinations
355+
356+
# Verify socket path was generated
357+
assert len(cp) == 1
358+
assert ("user", "hostname") in cp
359+
360+
# Verify open_master_connection was called with proper parameters
361+
mock_open.assert_called_once()
362+
call_kwargs = mock_open.call_args
363+
assert call_kwargs.kwargs["username"] == "user"
364+
assert call_kwargs.kwargs["hostname"] == "hostname"
365+
assert call_kwargs.kwargs["socket"].endswith(".sock")
366+
367+
def test_ssh_socket_name_uses_md5_hash(self):
368+
"""Test that socket name is derived from MD5 hash of username@hostname."""
369+
import hashlib
370+
371+
remotes = [("testuser", "testhost")]
372+
373+
with patch(
374+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
375+
return_value="/path/to/socket.sock",
376+
) as mock_open:
377+
U.ssh_setup_masters(remotes)
378+
379+
# Calculate expected hash
380+
m = hashlib.md5()
381+
m.update("testuser@testhost".encode("utf-8"))
382+
expected_hash_int = int(m.hexdigest(), 16)
383+
384+
# Extract socket path from call
385+
socket_path = mock_open.call_args.kwargs["socket"]
386+
socket_name = Path(socket_path).stem # Get filename without .sock
387+
388+
# Verify socket name starts with the hash conversion
389+
assert socket_name == str(expected_hash_int)[: len(socket_name)]
390+
391+
def test_ssh_socket_path_length_limit(self):
392+
"""Test that socket paths respect UNIX_SOCKET_MAX_PATH_LEN limit."""
393+
remotes = [("user", "host")]
394+
395+
with patch(
396+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
397+
return_value="/path/to/socket.sock",
398+
) as mock_open:
399+
U.ssh_setup_masters(remotes)
400+
401+
socket_path = mock_open.call_args.kwargs["socket"]
402+
# Socket path should be less than the max length
403+
assert len(socket_path) <= U.UNIX_SOCKET_MAX_PATH_LEN
404+
405+
def test_ssh_socket_path_too_long_connections_dir(self, tmp_path: Path):
406+
"""Test that RuntimeError is raised when CONNECTIONS_DIR path is too long."""
407+
# Create a very long path
408+
long_base = tmp_path / ("x" * 200)
409+
410+
remotes = [("user", "host")]
411+
412+
with patch(
413+
"nemo_evaluator_launcher.exporters.utils.CONNECTIONS_DIR", long_base
414+
):
415+
with pytest.raises(
416+
RuntimeError,
417+
match="Base connections directory path is too long for SSH control socket",
418+
):
419+
U.ssh_setup_masters(remotes)
420+
421+
def test_ssh_socket_name_truncation(self):
422+
"""Test that socket name is properly truncated to fit within path length limit."""
423+
424+
remotes = [("verylongusername", "verylonghostname.example.com")]
425+
426+
with patch(
427+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
428+
return_value="/path/to/socket.sock",
429+
) as mock_open:
430+
U.ssh_setup_masters(remotes)
431+
432+
socket_path = mock_open.call_args.kwargs["socket"]
433+
socket_name = Path(socket_path).stem
434+
435+
# Calculate expected maximum socket name length
436+
connections_dir_len = len(str(U.CONNECTIONS_DIR))
437+
max_socket_name_len = U.UNIX_SOCKET_MAX_PATH_LEN - connections_dir_len - 16
438+
439+
# Verify socket name length doesn't exceed the calculated maximum
440+
assert len(socket_name) <= max_socket_name_len
441+
442+
# Verify it's still a valid integer string (truncated hash)
443+
assert socket_name.isdigit()
444+
445+
def test_ssh_multiple_remotes_unique_sockets(self):
446+
"""Test that multiple remotes get unique socket paths."""
447+
remotes = [("user1", "host1"), ("user2", "host2"), ("user1", "host2")]
448+
449+
socket_paths = []
450+
451+
def mock_open_connection(username, hostname, socket):
452+
socket_paths.append(socket)
453+
return socket
454+
455+
with patch(
456+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
457+
side_effect=mock_open_connection,
458+
):
459+
cp = U.ssh_setup_masters(remotes)
460+
461+
# All socket paths should be unique
462+
assert len(set(socket_paths)) == 3
463+
assert len(cp) == 3
464+
465+
# Each remote should have its own socket
466+
assert ("user1", "host1") in cp
467+
assert ("user2", "host2") in cp
468+
assert ("user1", "host2") in cp
469+
470+
def test_ssh_duplicate_remotes_deduplicated(self):
471+
"""Test that duplicate remote entries are handled correctly."""
472+
# Note: Currently ssh_setup_masters doesn't deduplicate, it processes all
473+
# This test documents current behavior
474+
remotes = [("user", "host"), ("user", "host")]
475+
476+
call_count = 0
477+
478+
def mock_open_connection(username, hostname, socket):
479+
nonlocal call_count
480+
call_count += 1
481+
return socket
482+
483+
with patch(
484+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
485+
side_effect=mock_open_connection,
486+
):
487+
cp = U.ssh_setup_masters(remotes)
488+
489+
# Both calls are made (no deduplication in the function)
490+
assert call_count == 2
491+
# But only one entry in the dict (last one wins)
492+
assert len(cp) == 1
354493
assert ("user", "host") in cp
355-
assert ("other", "otherhost") in cp
356-
assert cp[("user", "host")].endswith("user_host.sock")
357-
U.ssh_cleanup_masters(cp)
358-
assert mock_run.call_count >= 2
494+
495+
def test_ssh_socket_path_consistent_for_same_remote(self):
496+
"""Test that the same remote always generates the same socket path."""
497+
import hashlib
498+
499+
username, hostname = "testuser", "testhost"
500+
remotes = [(username, hostname)]
501+
502+
# Calculate expected socket name
503+
m = hashlib.md5()
504+
m.update(f"{username}@{hostname}".encode("utf-8"))
505+
connections_dir_len = len(str(U.CONNECTIONS_DIR))
506+
socket_length = U.UNIX_SOCKET_MAX_PATH_LEN - connections_dir_len - 16
507+
expected_socket_name = str(int(m.hexdigest(), 16))[:socket_length]
508+
expected_socket_path = str(U.CONNECTIONS_DIR / f"{expected_socket_name}.sock")
509+
510+
with patch(
511+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
512+
return_value=expected_socket_path,
513+
) as mock_open:
514+
U.ssh_setup_masters(remotes)
515+
516+
# Verify the socket path matches expected
517+
actual_socket_path = mock_open.call_args.kwargs["socket"]
518+
assert actual_socket_path == expected_socket_path
519+
520+
def test_ssh_empty_remotes_list(self):
521+
"""Test that empty remotes list returns empty control_paths dict."""
522+
cp = U.ssh_setup_masters([])
523+
assert cp == {}
524+
525+
def test_ssh_socket_fails_open_connection_returns_none(self):
526+
"""Test that when open_master_connection returns None, socket is not added to control_paths."""
527+
remotes = [("user1", "host1"), ("user2", "host2")]
528+
529+
def mock_open_connection(username, hostname, socket):
530+
# First connection succeeds, second fails
531+
if username == "user1":
532+
return socket
533+
return None
534+
535+
with patch(
536+
"nemo_evaluator_launcher.exporters.utils.open_master_connection",
537+
side_effect=mock_open_connection,
538+
):
539+
cp = U.ssh_setup_masters(remotes)
540+
541+
# Only successful connection should be in control_paths
542+
assert len(cp) == 1
543+
assert ("user1", "host1") in cp
544+
assert ("user2", "host2") not in cp
359545

360546
def test_download_artifacts_only_required_with_logs(self, tmp_path: Path):
361547
class FakePopen:

0 commit comments

Comments
 (0)