Skip to content

Commit 2ec4847

Browse files
committed
Use a consistent container name prefix with container_scope=job
This fixes the common use case of attaching to a running container that was started without --container-name and is therefore using the default name of "$SLURM_JOB_ID.$SLURM_STEP_ID".

Signed-off-by: Felix Abecassis <fabecassis@nvidia.com>
1 parent 3b408d3 commit 2ec4847

File tree

3 files changed

+21
-6
lines changed

3 files changed

+21
-6
lines changed

pyxis_slurmd.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
33
*/
44

55
#include <errno.h>
@@ -54,7 +54,7 @@ static int pyxis_container_cleanup(uid_t uid, gid_t gid, uint32_t jobid)
5454
int ret;
5555
FILE *fp = NULL;
5656
char *name = NULL;
57-
uint32_t id, stepid;
57+
uint32_t id;
5858
int n;
5959
int rv = -1;
6060

@@ -67,8 +67,7 @@ static int pyxis_container_cleanup(uid_t uid, gid_t gid, uint32_t jobid)
6767

6868
while ((name = get_line_from_file(fp)) != NULL) {
6969
/* Remove named and unnamed pyxis containers for this job */
70-
if (sscanf(name, "pyxis_%u.%u%n", &id, &stepid, &n) == 2 ||
71-
sscanf(name, "pyxis_%u_%*s%n", &id, &n) == 1) {
70+
if (sscanf(name, "pyxis_%u_%*s%n", &id, &n) == 1) {
7271
if (strlen(name) != n || id != jobid)
7372
continue;
7473

pyxis_slurmstepd.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
33
*/
44

55
#include <linux/limits.h>
@@ -1001,7 +1001,10 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av)
10011001
context.container.name = container_name;
10021002
container_name = NULL;
10031003
} else {
1004-
ret = xasprintf(&context.container.name, "pyxis_%u.%u", context.job.jobid, context.job.stepid);
1004+
if (context.config.container_scope == SCOPE_JOB)
1005+
ret = xasprintf(&context.container.name, "pyxis_%u_%u.%u", context.job.jobid, context.job.jobid, context.job.stepid);
1006+
else
1007+
ret = xasprintf(&context.container.name, "pyxis_%u.%u", context.job.jobid, context.job.stepid);
10051008
if (ret < 0)
10061009
goto fail;
10071010
context.container.temporary_rootfs = true;

tests/exec.bats

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,19 @@ function teardown() {
3030
run_srun --overlap --container-name=exec-test findmnt /mymnt
3131
}
3232

33+
@test "attach to running unnamed container" {
34+
run_srun sh -c 'echo $SLURM_JOB_ID'
35+
job_id="${lines[-1]}"
36+
run_srun sh -c 'echo $SLURM_STEP_ID'
37+
step_id="${lines[-1]}"
38+
container_name="${job_id}.$((step_id + 1))"
39+
40+
run_srun --container-image=ubuntu:22.04 sh -c 'sleep 10s' &
41+
42+
sleep 3s
43+
run_srun --overlap --container-name="${container_name}" true
44+
}
45+
3346
@test "attach to running container after directory change" {
3447
run_srun --container-image=ubuntu:20.04 --container-name=exec-test bash -c "cd /var && sleep 30s" &
3548

0 commit comments

Comments
 (0)