11/*
2- * Copyright (c) 2019-2023 , NVIDIA CORPORATION. All rights reserved.
2+ * Copyright (c) 2019-2024 , NVIDIA CORPORATION. All rights reserved.
33 */
44
55#include <linux/limits.h>
@@ -1338,21 +1338,22 @@ static int enroot_container_export(void)
13381338 return (0 );
13391339}
13401340
/*
 * enroot_export (replaces enroot_export_once in this diff).
 *
 * Reading the hunk below: lines marked "-" are the removed body of
 * enroot_export_once(container, shm); lines marked "+" are the new body of
 * enroot_export(void), which reads the file-level `context` directly.
 *
 * Behavior of the new version, in order:
 *   - returns 0 without exporting when context.container.save_path is NULL;
 *   - returns 0 without exporting when context.shm->started_tasks differs
 *     from context.job.local_task_count (job interrupted before all local
 *     tasks started);
 *   - calls enroot_container_export() and returns -1 if it reports a
 *     negative value;
 *   - otherwise logs the export through slurm_spank_log() and returns 0.
 *
 * What changed relative to the removed version:
 *   - the atomic_fetch_add() on context.shm->completed_tasks is no longer
 *     performed here, so this function no longer decides by itself whether
 *     the caller is the last task to exit;
 *   - the save_path == NULL check is now performed inside this function.
 */
1341- static int enroot_export_once ( struct container * container , struct shared_memory * shm )
1341+ static int enroot_export ( void )
13421342{
13431343 int ret ;
13441344
1345- if (atomic_fetch_add (& context .shm -> completed_tasks , 1 ) == context .job .local_task_count - 1 ) {
1346- /* Check if job was interrupted before it fully started. */
1347- if (context .shm -> started_tasks != context .job .local_task_count )
1348- return (0 );
1345+ if (context .container .save_path == NULL )
1346+ return (0 );
13491347
1350- ret = enroot_container_export ();
1351- if (ret < 0 )
1352- return (-1 );
1348+ /* Check if job was interrupted before it fully started. */
1349+ if (context . shm -> started_tasks != context . job . local_task_count )
1350+ return (0 );
13531351
1354- slurm_spank_log ("pyxis: exported container %s to %s" , context .container .name , context .container .save_path );
1355- }
1352+ ret = enroot_container_export ();
1353+ if (ret < 0 )
1354+ return (-1 );
1355+
1356+ slurm_spank_log ("pyxis: exported container %s to %s" , context .container .name , context .container .save_path );
13561357
13571358 return (0 );
13581359}
@@ -1365,18 +1366,29 @@ int slurm_spank_task_exit(spank_t sp, int ac, char **av)
13651366 if (!context .enabled )
13661367 return (0 );
13671368
1368- if (context .container .save_path == NULL )
1369- return (0 );
1369+ rv = 0 ;
1370+ /* Last task to exit does the container export and/or container cleanup, if needed. */
1371+ if (atomic_fetch_add (& context .shm -> completed_tasks , 1 ) == context .job .local_task_count - 1 ) {
1372+ ret = enroot_export ();
1373+ if (ret < 0 ) {
1374+ slurm_error ("pyxis: failed to export container %s to %s" , context .container .name , context .container .save_path );
1375+ rv = -1 ;
1376+ }
13701377
1371- ret = enroot_export_once (& context .container , context .shm );
1372- if (ret < 0 ) {
1373- slurm_error ("pyxis: failed to export container %s to %s" , context .container .name , context .container .save_path );
1374- goto fail ;
1375- }
1378+ /* Need to cleanup the temporary squashfs if the task running "enroot import" was interrupted. */
1379+ if (context .container .temporary_squashfs && context .container .squashfs_path != NULL )
1380+ unlink (context .container .squashfs_path );
13761381
1377- rv = 0 ;
1382+ if (context .container .temporary_rootfs ) {
1383+ slurm_info ("pyxis: removing container filesystem: %s" , context .container .name );
1384+
1385+ ret = enroot_exec_wait_ctx ((char * const []){ "enroot" , "remove" , "-f" , context .container .name , NULL });
1386+ if (ret < 0 )
1387+ slurm_info ("pyxis: failed to remove container filesystem: %s" , context .container .name );
1388+ }
1389+
1390+ }
13781391
1379- fail :
13801392 return (rv );
13811393}
13821394
@@ -1385,21 +1397,6 @@ int pyxis_slurmstepd_exit(spank_t sp, int ac, char **av)
13851397 int ret ;
13861398 int rv = 0 ;
13871399
1388- /* Need to cleanup the temporary squashfs if the task running "enroot import" was interrupted. */
1389- if (context .container .temporary_squashfs && context .container .squashfs_path != NULL )
1390- unlink (context .container .squashfs_path );
1391-
1392- if (context .container .temporary_rootfs ) {
1393- slurm_info ("pyxis: removing container filesystem: %s" , context .container .name );
1394-
1395- ret = enroot_exec_wait_ctx ((char * const []){ "enroot" , "remove" , "-f" , context .container .name , NULL });
1396- if (ret < 0 ) {
1397- slurm_error ("pyxis: failed to remove container filesystem: %s" , context .container .name );
1398- enroot_print_log_ctx ();
1399- rv = -1 ;
1400- }
1401- }
1402-
14031400 free (context .container .name );
14041401 free (context .container .squashfs_path );
14051402 free (context .container .save_path );
0 commit comments