File tree Expand file tree Collapse file tree 1 file changed +18
-2
lines changed
examples/machine-learning/a3-megagpu-8g Expand file tree Collapse file tree 1 file changed +18
-2
lines changed Original file line number Diff line number Diff line change @@ -459,8 +459,24 @@ deployment_groups:
459459 - name: Configure Slurm to depend upon aperture devices
460460 hosts: all
461461 become: true
462- vars: {}
463462 tasks:
463+ - name: Create wait script for aperture devices
464+ ansible.builtin.copy:
465+ dest: /usr/local/bin/wait-for-aperture-devices.sh
466+ owner: root
467+ group: root
468+ mode: 0o755
469+ content: |
470+ #!/bin/bash
471+ # Wait up to 60 seconds for aperture devices to appear.
472+ for i in \$(seq 30); do
473+ # Check if the directory is not empty
474+ if [ -n "\$(ls -A /dev/aperture_devices 2>/dev/null)" ]; then
475+ exit 0
476+ fi
477+ sleep 2
478+ done
479+ exit 1
464480 - name: Ensure slurmd starts after aperture devices are ready
465481 ansible.builtin.copy:
466482 dest: /etc/systemd/system/slurmd.service.d/aperture.conf
@@ -469,7 +485,7 @@ deployment_groups:
469485 mode: 0o644
470486 content: |
471487 [Service]
472- ExecCondition=/usr/bin/test -d /dev/aperture_devices/
488+ ExecCondition=/usr/local/bin/wait-for-aperture-devices.sh
473489 notify: Reload SystemD
474490 handlers:
475491 - name: Reload SystemD
You can’t perform that action at this time.
0 commit comments