Skip to content

Commit a4b386f

Browse files
committed
Move container cleanup to slurm_spank_task_exit on Slurm 25.05+
See https://support.schedmd.com/show_bug.cgi?id=19362
Signed-off-by: Felix Abecassis <fabecassis@nvidia.com>
1 parent 7bbf5ba commit a4b386f

File tree

1 file changed

+41
-21
lines changed

1 file changed

+41
-21
lines changed

pyxis_slurmstepd.c

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <unistd.h>
2626

2727
#include <slurm/spank.h>
28+
#include <slurm/slurm_version.h>
2829

2930
#include "pyxis_slurmstepd.h"
3031
#include "common.h"
@@ -1458,15 +1459,45 @@ static int enroot_export(void)
14581459
return (0);
14591460
}
14601461

1462+
static int enroot_cleanup(void)
1463+
{
1464+
int ret;
1465+
int rv = 0;
1466+
1467+
/* Need to remove the temporary squashfs if the task was interrupted before cleanup. */
1468+
if (context.container.use_enroot_import && context.container.squashfs_path != NULL)
1469+
unlink(context.container.squashfs_path);
1470+
1471+
if (context.container.use_importer) {
1472+
ret = importer_exec_release(context.config.importer_path, context.job.uid, context.job.gid,
1473+
enroot_set_env);
1474+
if (ret < 0) {
1475+
slurm_info("pyxis: failed to call importer release");
1476+
rv = -1;
1477+
}
1478+
}
1479+
1480+
if (context.container.temporary_rootfs) {
1481+
slurm_info("pyxis: removing container filesystem: %s", context.container.name);
1482+
1483+
ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "remove", "-f", context.container.name, NULL });
1484+
if (ret < 0) {
1485+
slurm_info("pyxis: failed to remove container filesystem: %s", context.container.name);
1486+
rv = -1;
1487+
}
1488+
}
1489+
1490+
return (rv);
1491+
}
1492+
14611493
int slurm_spank_task_exit(spank_t sp, int ac, char **av)
14621494
{
14631495
int ret;
1464-
int rv = -1;
1496+
int rv = 0;
14651497

14661498
if (!context.enabled)
14671499
return (0);
14681500

1469-
rv = 0;
14701501
/* Last task to exit does the container export and/or container cleanup, if needed. */
14711502
if (atomic_fetch_add(&context.shm->completed_tasks, 1) == context.job.local_task_count - 1) {
14721503
ret = enroot_export();
@@ -1475,25 +1506,9 @@ int slurm_spank_task_exit(spank_t sp, int ac, char **av)
14751506
rv = -1;
14761507
}
14771508

1478-
/* Need to remove the temporary squashfs if the task was interrupted before cleanup. */
1479-
if (context.container.use_enroot_import && context.container.squashfs_path != NULL)
1480-
unlink(context.container.squashfs_path);
1481-
1482-
if (context.container.use_importer) {
1483-
ret = importer_exec_release(context.config.importer_path, context.job.uid, context.job.gid,
1484-
enroot_set_env);
1485-
if (ret < 0)
1486-
slurm_info("pyxis: failed to call importer release");
1487-
}
1488-
1489-
if (context.container.temporary_rootfs) {
1490-
slurm_info("pyxis: removing container filesystem: %s", context.container.name);
1491-
1492-
ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "remove", "-f", context.container.name, NULL });
1493-
if (ret < 0)
1494-
slurm_info("pyxis: failed to remove container filesystem: %s", context.container.name);
1495-
}
1496-
1509+
/* Slurm < 25.05: do cleanup here, before pam_finish: https://support.schedmd.com/show_bug.cgi?id=19362 */
1510+
if (SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(25, 5, 0))
1511+
enroot_cleanup();
14971512
}
14981513

14991514
return (rv);
@@ -1504,6 +1519,11 @@ int pyxis_slurmstepd_exit(spank_t sp, int ac, char **av)
15041519
int ret;
15051520
int rv = 0;
15061521

1522+
if (context.enabled) {
1523+
if (SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(25, 5, 0))
1524+
enroot_cleanup();
1525+
}
1526+
15071527
free(context.container.name);
15081528
free(context.container.squashfs_path);
15091529
free(context.container.save_path);

0 commit comments

Comments
 (0)