Skip to content

Commit f23bafb

Browse files
committed
Move container cleanup to slurm_spank_task_exit
See https://bugs.schedmd.com/show_bug.cgi?id=19362

Signed-off-by: Felix Abecassis <fabecassis@nvidia.com>
1 parent cd45001 commit f23bafb

File tree

1 file changed

+32
-35
lines changed

1 file changed

+32
-35
lines changed

pyxis_slurmstepd.c

Lines changed: 32 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
33
*/
44

55
#include <linux/limits.h>
@@ -1338,21 +1338,22 @@ static int enroot_container_export(void)
13381338
return (0);
13391339
}
13401340

1341-
static int enroot_export_once(struct container *container, struct shared_memory *shm)
1341+
static int enroot_export(void)
13421342
{
13431343
int ret;
13441344

1345-
if (atomic_fetch_add(&context.shm->completed_tasks, 1) == context.job.local_task_count - 1) {
1346-
/* Check if job was interrupted before it fully started. */
1347-
if (context.shm->started_tasks != context.job.local_task_count)
1348-
return (0);
1345+
if (context.container.save_path == NULL)
1346+
return (0);
13491347

1350-
ret = enroot_container_export();
1351-
if (ret < 0)
1352-
return (-1);
1348+
/* Check if job was interrupted before it fully started. */
1349+
if (context.shm->started_tasks != context.job.local_task_count)
1350+
return (0);
13531351

1354-
slurm_spank_log("pyxis: exported container %s to %s", context.container.name, context.container.save_path);
1355-
}
1352+
ret = enroot_container_export();
1353+
if (ret < 0)
1354+
return (-1);
1355+
1356+
slurm_spank_log("pyxis: exported container %s to %s", context.container.name, context.container.save_path);
13561357

13571358
return (0);
13581359
}
@@ -1365,18 +1366,29 @@ int slurm_spank_task_exit(spank_t sp, int ac, char **av)
13651366
if (!context.enabled)
13661367
return (0);
13671368

1368-
if (context.container.save_path == NULL)
1369-
return (0);
1369+
rv = 0;
1370+
/* Last task to exit does the container export and/or container cleanup, if needed. */
1371+
if (atomic_fetch_add(&context.shm->completed_tasks, 1) == context.job.local_task_count - 1) {
1372+
ret = enroot_export();
1373+
if (ret < 0) {
1374+
slurm_error("pyxis: failed to export container %s to %s", context.container.name, context.container.save_path);
1375+
rv = -1;
1376+
}
13701377

1371-
ret = enroot_export_once(&context.container, context.shm);
1372-
if (ret < 0) {
1373-
slurm_error("pyxis: failed to export container %s to %s", context.container.name, context.container.save_path);
1374-
goto fail;
1375-
}
1378+
/* Need to cleanup the temporary squashfs if the task running "enroot import" was interrupted. */
1379+
if (context.container.temporary_squashfs && context.container.squashfs_path != NULL)
1380+
unlink(context.container.squashfs_path);
13761381

1377-
rv = 0;
1382+
if (context.container.temporary_rootfs) {
1383+
slurm_info("pyxis: removing container filesystem: %s", context.container.name);
1384+
1385+
ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "remove", "-f", context.container.name, NULL });
1386+
if (ret < 0)
1387+
slurm_info("pyxis: failed to remove container filesystem: %s", context.container.name);
1388+
}
1389+
1390+
}
13781391

1379-
fail:
13801392
return (rv);
13811393
}
13821394

@@ -1385,21 +1397,6 @@ int pyxis_slurmstepd_exit(spank_t sp, int ac, char **av)
13851397
int ret;
13861398
int rv = 0;
13871399

1388-
/* Need to cleanup the temporary squashfs if the task running "enroot import" was interrupted. */
1389-
if (context.container.temporary_squashfs && context.container.squashfs_path != NULL)
1390-
unlink(context.container.squashfs_path);
1391-
1392-
if (context.container.temporary_rootfs) {
1393-
slurm_info("pyxis: removing container filesystem: %s", context.container.name);
1394-
1395-
ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "remove", "-f", context.container.name, NULL });
1396-
if (ret < 0) {
1397-
slurm_error("pyxis: failed to remove container filesystem: %s", context.container.name);
1398-
enroot_print_log_ctx();
1399-
rv = -1;
1400-
}
1401-
}
1402-
14031400
free(context.container.name);
14041401
free(context.container.squashfs_path);
14051402
free(context.container.save_path);

0 commit comments

Comments
 (0)