@@ -27,6 +27,10 @@ OCF_BOOTSTRAP_ADDR="/ip4/148.187.108.178/tcp/43905/p2p/QmbUKJkCfotDzbFE5uoTsXD4G
# Template-rendered settings read by the metrics section further down.
# Model name; attached to pushed metrics as the "model" label.
2727SERVED_MODEL_NAME=" {{ served_model_name }}"
# Remote-write endpoint; renders empty when the template value is not provided,
# and the metrics launch block only runs when this is non-empty.
2828METRICS_REMOTE_WRITE_URL=" {{ metrics_remote_write_url or '' }}"
# Path prefix of the metrics agent binary; the architecture branch below
# appends -arm64 or -amd64 to it.
2929METRICS_AGENT_BIN=" {{ metrics_agent_binary }}"
# Path prefix of the dcgm-exporter binary; gets the same architecture suffix.
30+ DCGM_EXPORTER_BIN=" {{ dcgm_exporter_binary }}"
# Renders false when disable_dcgm_exporter is set in the template context, true otherwise.
31+ USE_DCGM_EXPORTER={{ " false" if disable_dcgm_exporter else " true" }}
# Renders false when disable_metrics is set in the template context, true otherwise.
32+ USE_METRICS={{ " false" if disable_metrics else " true" }}
33+
3034
3135{% if telemetry_endpoint %}
3236curl -sf -X POST " {{ telemetry_endpoint }}" \
@@ -44,11 +48,13 @@ if [[ "$ARCH" == "aarch64" ]]; then
4448 export SP_NCCL_SO_PATH=/usr/lib/aarch64-linux-gnu/
4549 export OCF_BIN=/ocfbin/ocf-arm
4650 METRICS_AGENT_BIN=" ${METRICS_AGENT_BIN} -arm64"
51+ DCGM_EXPORTER_BIN=" ${DCGM_EXPORTER_BIN} -arm64"
4752elif [[ " $ARCH " == " x86_64" ]]; then
4853 echo " Running on x86_64"
4954 export SP_NCCL_SO_PATH=/usr/lib/x86_64-linux-gnu/
5055 export OCF_BIN=/ocfbin/ocf-amd64
5156 METRICS_AGENT_BIN=" ${METRICS_AGENT_BIN} -amd64"
57+ DCGM_EXPORTER_BIN=" ${DCGM_EXPORTER_BIN} -amd64"
5258else
5359 echo " Unknown architecture: $ARCH "
5460 exit 1
@@ -197,23 +203,69 @@ $FRAMEWORK_CMD" &
197203 done
198204done
199205
# Launch vmagent and dcgm-exporter on every node of the job.
# The batch node (index 0 of the nodes array) starts them directly; every other
# node is reached through `srun --overlap`.
# The batch node scrapes framework metrics (8080) + DCGM (9400); workers scrape
# only DCGM (9400).

#######################################
# Print the vmagent flags shared by all nodes as ONE space-separated line.
# The result is pasted into the script text handed to `bash -c` on worker
# nodes; an embedded newline there would end the vmagent command early and
# turn every following flag into a separate (failing) command.
# Values are split on whitespace when used, so they must not contain spaces.
# Globals: METRICS_REMOTE_WRITE_URL, SLURM_JOB_ID, SERVED_MODEL_NAME,
#          FRAMEWORK, USER (read)
# Outputs: the flag string on stdout, without a trailing newline
#######################################
ocf_vmagent_common_args() {
  printf '%s' \
    "-remoteWrite.url=${METRICS_REMOTE_WRITE_URL}" \
    " -remoteWrite.label=slurm_job_id=${SLURM_JOB_ID}" \
    " -remoteWrite.label=model=${SERVED_MODEL_NAME}" \
    " -remoteWrite.label=framework=${FRAMEWORK}" \
    " -remoteWrite.label=user=${USER}"
}

#######################################
# Print the shell script that a worker node runs under `bash -c`.
# Every variable is expanded here, on the batch node; only `hostname` is left
# for the worker to evaluate so the node label names the worker itself.
# Workers start vmagent only when dcgm-exporter is enabled, because DCGM is the
# only target in their scrape config.
# Globals: USE_DCGM_EXPORTER, DCGM_EXPORTER_BIN, DCGM_COMMON_ARGS, DCGM_LOG,
#          METRICS_AGENT_BIN, VMAGENT_COMMON_ARGS, METRICS_CONFIG_DIR,
#          VMAGENT_DATA, VMAGENT_LOG (read)
# Outputs: the script text on stdout
#######################################
ocf_metrics_worker_script() {
  cat <<EOF
if [ "$USE_DCGM_EXPORTER" = 'true' ]; then
  if [ -e /dev/nvidia0 ] && [ -x "$DCGM_EXPORTER_BIN" ]; then
    "$DCGM_EXPORTER_BIN" $DCGM_COMMON_ARGS > "$DCGM_LOG" 2>&1 &
  else
    echo 'dcgm-exporter: no NVIDIA GPU or binary not found, skipping' >&2
  fi
  "$METRICS_AGENT_BIN" $VMAGENT_COMMON_ARGS \\
    -promscrape.config="$METRICS_CONFIG_DIR/vmagent-scrape-dcgm-only.yaml" \\
    -remoteWrite.label="node=\$(hostname)" \\
    -remoteWrite.tmpDataPath="$VMAGENT_DATA" \\
    > "$VMAGENT_LOG" 2>&1 &
else
  echo 'dcgm-exporter: disabled (USE_DCGM_EXPORTER != true), skipping vmagent on worker node' >&2
fi
wait
EOF
}

if [ -n "$METRICS_REMOTE_WRITE_URL" ] && [ -x "$METRICS_AGENT_BIN" ]; then
  VMAGENT_COMMON_ARGS="$(ocf_vmagent_common_args)"
  METRICS_CONFIG_DIR="/capstor/store/cscs/swissai/infra01/ocf-share"
  DCGM_COMMON_ARGS="--address 0.0.0.0:9400 -f $METRICS_CONFIG_DIR/default-counters.csv"
  DCGM_LOG="/tmp/dcgm-exporter-${SLURM_JOB_ID}.log"
  VMAGENT_LOG="/tmp/vmagent-${SLURM_JOB_ID}.log"
  VMAGENT_DATA="/tmp/vmagent-data-${SLURM_JOB_ID}"

  if [ "$USE_METRICS" != "true" ]; then
    echo "Metrics disabled, skipping vmagent initialization" >&2
  else
    for i in "${!nodes[@]}"; do
      node="${nodes[$i]}"
      if [ "$i" -eq 0 ]; then
        # Batch node: start the exporters directly in this shell.
        if [ "$USE_DCGM_EXPORTER" = "true" ]; then
          echo "dcgm-exporter: /dev/nvidia0 exists=$([ -e /dev/nvidia0 ] && echo yes || echo no), binary=$DCGM_EXPORTER_BIN executable=$([ -x "$DCGM_EXPORTER_BIN" ] && echo yes || echo no)" >&2
          if [ -e /dev/nvidia0 ] && [ -x "$DCGM_EXPORTER_BIN" ]; then
            # shellcheck disable=SC2086 # flag string is split into words on purpose
            "$DCGM_EXPORTER_BIN" $DCGM_COMMON_ARGS > "$DCGM_LOG" 2>&1 &
          else
            echo "dcgm-exporter: no NVIDIA GPU or binary not found, skipping" >&2
          fi
          VMAGENT_SCRAPE_CONFIG="$METRICS_CONFIG_DIR/vmagent-scrape.yaml"
        else
          echo "dcgm-exporter: disabled (USE_DCGM_EXPORTER != true), skipping" >&2
          VMAGENT_SCRAPE_CONFIG="$METRICS_CONFIG_DIR/vmagent-scrape-no-dcgm.yaml"
        fi
        # shellcheck disable=SC2086 # flag string is split into words on purpose
        "$METRICS_AGENT_BIN" $VMAGENT_COMMON_ARGS \
          -promscrape.config="$VMAGENT_SCRAPE_CONFIG" \
          -remoteWrite.label="node=$(hostname)" \
          -remoteWrite.tmpDataPath="$VMAGENT_DATA" \
          > "$VMAGENT_LOG" 2>&1 &
      else
        # Worker node: the job step stays alive (via the script's final `wait`)
        # for as long as the exporters it started are running.
        srun --nodes=1 --ntasks=1 --nodelist="$node" --overlap \
          bash -c "$(ocf_metrics_worker_script)" &
      fi
    done
  fi
elif [ -n "$METRICS_REMOTE_WRITE_URL" ]; then
  # Only reported when pushing was requested; an empty URL means metrics were
  # never configured and there is nothing to warn about.
  echo "metrics: $METRICS_AGENT_BIN not found, skipping push" >&2
fi
219271
0 commit comments