We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 466339c, commit 2537ea8 (Copy full SHA for 2537ea8)
1 file changed
src/swiss_ai_model_launch/assets/template.jinja
@@ -223,12 +223,12 @@ fi
223
# Launch DCGM exporter on the batch node to expose GPU metrics on port 9400.
224
# vmagent scrape config should include a job targeting localhost:9400 to collect
225
# these metrics alongside the framework metrics.
226
-if [ -x "$DCGM_EXPORTER_BIN" ]; then
+if [ -e /dev/nvidia0 ] && [ -x "$DCGM_EXPORTER_BIN" ]; then
227
"$DCGM_EXPORTER_BIN" \
228
- --address 0.0.0.0:9400 \
+ --address 0.0.0.0:9400 -f /capstor/store/cscs/swissai/infra01/ocf-share/default-counters.csv \
229
> /tmp/dcgm-exporter-${SLURM_JOB_ID}.log 2>&1 &
230
else
231
- echo "dcgm-exporter: $DCGM_EXPORTER_BIN not found, skipping" >&2
+ echo "dcgm-exporter: no NVIDIA GPU or binary not found, skipping" >&2
232
fi
233
234
# Optional router launch
0 commit comments