-
Notifications
You must be signed in to change notification settings - Fork 429
Open
Description
When the nvidia-persistenced service is enabled on the host, running nvidia-smi in a container takes much longer than without it (24s vs. 2s).
I did some investigation using strace, and the communication over the unix socket (/var/run/nvidia-persistenced/socket) does not appear to be the issue:
[pid 10] ioctl(9</dev/nvidiactl>, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fff958d6530) = 0
[pid 10] ioctl(9</dev/nvidiactl>, _IOC(_IOC_READ|_IOC_WRITE, 0x46, 0x2a, 0x20), 0x7fff958d6580) = 0
[pid 10] stat("/var/run/nvidia-persistenced/socket", {st_mode=S_IFSOCK|0777, st_size=0, ...}) = 0
[pid 10] socket(AF_UNIX, SOCK_STREAM, 0) = 15<socket:[27497]>
[pid 10] connect(15<socket:[27497]>, {sa_family=AF_UNIX, sun_path="/var/run/nvidia-persistenced/socket"}, 37) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, ~[RTMIN RT_1], [], 8) = 0
[pid 10] getrlimit(RLIMIT_NOFILE, {rlim_cur=1073741816, rlim_max=1073741816}) = 0
[pid 10] mmap(NULL, 4294967296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fac18000000
>> long hang here
[pid 10] mmap(NULL, 51539607552, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa018000000
>> and also here
[pid 10] getpeername(15<socket:[27497]>, {sa_family=AF_UNIX, sun_path="/var/run/nvidia-persistenced/socket"}, [128->38]) = 0
[pid 10] getsockname(15<socket:[27497]>, {sa_family=AF_UNIX}, [128->2]) = 0
[pid 10] getsockopt(15<socket:[27497]>, SOL_SOCKET, SO_TYPE, [1], [4]) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, ~[RTMIN RT_1], [], 8) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, ~[RTMIN RT_1], [], 8) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 10] rt_sigprocmask(SIG_SETMASK, ~[RTMIN RT_1], [], 8) = 0
[pid 10] write(15<socket:[27497]>, "\200\0\08\374\211\306]\0\0\0\0\0\0\0\2\0\0\210\276\0\0\0\1\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\20\0\0\0\0\0\0\0\0", 60) = 60
[pid 10] poll([{fd=15<socket:[27497]>, events=POLLIN}], 1, 50000) = 1 ([{fd=15, revents=POLLIN}])
[pid 10] read(15<socket:[27497]>, "\200\0\0 \374\211\306]\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1", 9000) = 36
The system is running Docker 24.0.2 and nvidia-container-toolkit 1.17.6.
Metadata
Metadata
Assignees
Labels
No labels