Skip to content

Commit 22a079a

Browse files
Fix transient aperture device error in A3M slurm yaml (#5114)
1 parent 59f4df2 commit 22a079a

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

examples/machine-learning/a3-megagpu-8g/a3mega-slurm-blueprint.yaml

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -459,8 +459,24 @@ deployment_groups:
459459
- name: Configure Slurm to depend upon aperture devices
460460
hosts: all
461461
become: true
462-
vars: {}
463462
tasks:
463+
- name: Create wait script for aperture devices
464+
ansible.builtin.copy:
465+
dest: /usr/local/bin/wait-for-aperture-devices.sh
466+
owner: root
467+
group: root
468+
mode: 0o755
469+
content: |
470+
#!/bin/bash
471+
# Wait up to 60 seconds for aperture devices to appear.
472+
for i in \$(seq 30); do
473+
# Check if the directory is not empty
474+
if [ -n "\$(ls -A /dev/aperture_devices 2>/dev/null)" ]; then
475+
exit 0
476+
fi
477+
sleep 2
478+
done
479+
exit 1
464480
- name: Ensure slurmd starts after aperture devices are ready
465481
ansible.builtin.copy:
466482
dest: /etc/systemd/system/slurmd.service.d/aperture.conf
@@ -469,7 +485,7 @@ deployment_groups:
469485
mode: 0o644
470486
content: |
471487
[Service]
472-
ExecCondition=/usr/bin/test -d /dev/aperture_devices/
488+
ExecCondition=/usr/local/bin/wait-for-aperture-devices.sh
473489
notify: Reload SystemD
474490
handlers:
475491
- name: Reload SystemD

0 commit comments

Comments
 (0)