Skip to content

Commit b2867a4

Browse files
Feature/DCGM Exporter (#98)
1 parent eef7d31 commit b2867a4

3 files changed

Lines changed: 85 additions & 16 deletions

File tree

src/swiss_ai_model_launch/assets/template.jinja

Lines changed: 68 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,10 @@ OCF_BOOTSTRAP_ADDR="/ip4/148.187.108.178/tcp/43905/p2p/QmbUKJkCfotDzbFE5uoTsXD4G
2727
SERVED_MODEL_NAME="{{ served_model_name }}"
2828
METRICS_REMOTE_WRITE_URL="{{ metrics_remote_write_url or '' }}"
2929
METRICS_AGENT_BIN="{{ metrics_agent_binary }}"
30+
DCGM_EXPORTER_BIN="{{ dcgm_exporter_binary }}"
31+
USE_DCGM_EXPORTER={{ "false" if disable_dcgm_exporter else "true" }}
32+
USE_METRICS={{ "false" if disable_metrics else "true" }}
33+
3034

3135
{% if telemetry_endpoint %}
3236
curl -sf -X POST "{{ telemetry_endpoint }}" \
@@ -44,11 +48,13 @@ if [[ "$ARCH" == "aarch64" ]]; then
4448
export SP_NCCL_SO_PATH=/usr/lib/aarch64-linux-gnu/
4549
export OCF_BIN=/ocfbin/ocf-arm
4650
METRICS_AGENT_BIN="${METRICS_AGENT_BIN}-arm64"
51+
DCGM_EXPORTER_BIN="${DCGM_EXPORTER_BIN}-arm64"
4752
elif [[ "$ARCH" == "x86_64" ]]; then
4853
echo "Running on x86_64"
4954
export SP_NCCL_SO_PATH=/usr/lib/x86_64-linux-gnu/
5055
export OCF_BIN=/ocfbin/ocf-amd64
5156
METRICS_AGENT_BIN="${METRICS_AGENT_BIN}-amd64"
57+
DCGM_EXPORTER_BIN="${DCGM_EXPORTER_BIN}-amd64"
5258
else
5359
echo "Unknown architecture: $ARCH"
5460
exit 1
@@ -197,23 +203,69 @@ $FRAMEWORK_CMD" &
197203
done
198204
done
199205

200-
# Push framework metrics to Prometheus via vmagent (runs on the batch node).
201-
# Pyxis containers share the host network namespace, so localhost:8080 on the
202-
# batch node reaches the framework's API server. The scrape config is a static
203-
# YAML staged alongside the binaries at /ocfbin/vmagent-scrape.yaml.
204-
# NOTE: only the worker on the batch node is scraped — multi-worker setups
205-
# would need a per-worker vmagent (future work).
206+
# Launch vmagent and dcgm-exporter on every node; worker nodes start neither when the DCGM exporter is disabled.
207+
# The batch node (index 0) runs directly; worker nodes run via srun --overlap.
208+
# The batch node scrapes framework metrics (8080) + DCGM (9400); workers scrape only DCGM (9400).
206209
if [ -n "$METRICS_REMOTE_WRITE_URL" ] && [ -x "$METRICS_AGENT_BIN" ]; then
207-
"$METRICS_AGENT_BIN" \
208-
-promscrape.config=/capstor/store/cscs/swissai/infra01/ocf-share/vmagent-scrape.yaml \
209-
-remoteWrite.url="${METRICS_REMOTE_WRITE_URL}" \
210-
-remoteWrite.label="slurm_job_id=${SLURM_JOB_ID}" \
211-
-remoteWrite.label="model=${SERVED_MODEL_NAME}" \
212-
-remoteWrite.label="framework=${FRAMEWORK}" \
213-
-remoteWrite.label="user=${USER}" \
214-
-remoteWrite.tmpDataPath=/tmp/vmagent-data-${SLURM_JOB_ID} \
215-
> /tmp/vmagent-${SLURM_JOB_ID}.log 2>&1 &
216-
elif [ -n "$METRICS_REMOTE_WRITE_URL" ]; then
210+
VMAGENT_COMMON_ARGS="
211+
-remoteWrite.url=${METRICS_REMOTE_WRITE_URL}
212+
-remoteWrite.label=slurm_job_id=${SLURM_JOB_ID}
213+
-remoteWrite.label=model=${SERVED_MODEL_NAME}
214+
-remoteWrite.label=framework=${FRAMEWORK}
215+
-remoteWrite.label=user=${USER}
216+
"
217+
METRICS_CONFIG_DIR="/capstor/store/cscs/swissai/infra01/ocf-share"
218+
DCGM_COMMON_ARGS="--address 0.0.0.0:9400 -f $METRICS_CONFIG_DIR/default-counters.csv"
219+
DCGM_LOG="/tmp/dcgm-exporter-${SLURM_JOB_ID}.log"
220+
VMAGENT_LOG="/tmp/vmagent-${SLURM_JOB_ID}.log"
221+
VMAGENT_DATA="/tmp/vmagent-data-${SLURM_JOB_ID}"
222+
223+
if [ "$USE_METRICS" != "true" ]; then
224+
echo "Metrics disabled, skipping vmagent initialization" >&2
225+
else
226+
for i in "${!nodes[@]}"; do
227+
node="${nodes[$i]}"
228+
if [ "$i" -eq 0 ]; then
229+
if [ "$USE_DCGM_EXPORTER" = "true" ]; then
230+
echo "dcgm-exporter: /dev/nvidia0 exists=$([ -e /dev/nvidia0 ] && echo yes || echo no), binary=$DCGM_EXPORTER_BIN executable=$([ -x "$DCGM_EXPORTER_BIN" ] && echo yes || echo no)" >&2
231+
if [ -e /dev/nvidia0 ] && [ -x "$DCGM_EXPORTER_BIN" ]; then
232+
"$DCGM_EXPORTER_BIN" $DCGM_COMMON_ARGS > $DCGM_LOG 2>&1 &
233+
else
234+
echo "dcgm-exporter: no NVIDIA GPU or binary not found, skipping" >&2
235+
fi
236+
VMAGENT_SCRAPE_CONFIG="$METRICS_CONFIG_DIR/vmagent-scrape.yaml"
237+
else
238+
echo "dcgm-exporter: disabled (USE_DCGM_EXPORTER != true), skipping" >&2
239+
VMAGENT_SCRAPE_CONFIG="$METRICS_CONFIG_DIR/vmagent-scrape-no-dcgm.yaml"
240+
fi
241+
"$METRICS_AGENT_BIN" $VMAGENT_COMMON_ARGS \
242+
-promscrape.config=$VMAGENT_SCRAPE_CONFIG \
243+
-remoteWrite.label="node=$(hostname)" \
244+
-remoteWrite.tmpDataPath=$VMAGENT_DATA \
245+
> $VMAGENT_LOG 2>&1 &
246+
else
247+
srun --nodes=1 --ntasks=1 --nodelist=$node --overlap \
248+
bash -c "
249+
if [ \"$USE_DCGM_EXPORTER\" = 'true' ]; then
250+
if [ -e /dev/nvidia0 ] && [ -x \"$DCGM_EXPORTER_BIN\" ]; then
251+
\"$DCGM_EXPORTER_BIN\" $DCGM_COMMON_ARGS > $DCGM_LOG 2>&1 &
252+
else
253+
echo 'dcgm-exporter: no NVIDIA GPU or binary not found, skipping' >&2
254+
fi
255+
\"$METRICS_AGENT_BIN\" $VMAGENT_COMMON_ARGS \
256+
-promscrape.config=$METRICS_CONFIG_DIR/vmagent-scrape-dcgm-only.yaml \
257+
-remoteWrite.label=\"node=\$(hostname)\" \
258+
-remoteWrite.tmpDataPath=$VMAGENT_DATA \
259+
> $VMAGENT_LOG 2>&1 &
260+
else
261+
echo 'dcgm-exporter: disabled (USE_DCGM_EXPORTER != true), skipping vmagent on worker node' >&2
262+
fi
263+
wait
264+
" &
265+
fi
266+
done
267+
fi
268+
else
217269
echo "metrics: $METRICS_AGENT_BIN not found, skipping push" >&2
218270
fi
219271

src/swiss_ai_model_launch/cli/main.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -246,6 +246,18 @@ def _build_parser() -> argparse.ArgumentParser:
246246
action="store_true",
247247
help="Disable OCF.",
248248
)
249+
advanced_parser.add_argument(
250+
"--disable-dcgm-exporter",
251+
dest="disable_dcgm_exporter",
252+
action="store_true",
253+
help="Disable the DCGM exporter.",
254+
)
255+
advanced_parser.add_argument(
256+
"--disable-metrics",
257+
dest="disable_metrics",
258+
action="store_true",
259+
help="Disable metrics collection.",
260+
)
249261
advanced_parser.add_argument(
250262
"--pre-launch-cmds",
251263
dest="pre_launch_cmds",
@@ -550,6 +562,8 @@ async def _run_advanced(args: argparse.Namespace) -> None:
550562
use_router=args.use_router,
551563
router_args=args.router_args,
552564
disable_ocf=args.disable_ocf,
565+
disable_dcgm_exporter=args.disable_dcgm_exporter,
566+
disable_metrics=args.disable_metrics,
553567
telemetry_endpoint=config.get_value("telemetry_endpoint"),
554568
)
555569

src/swiss_ai_model_launch/launchers/launch_args.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,9 @@ class LaunchArgs(BaseModel):
2525
telemetry_endpoint: str | None = None
2626
metrics_remote_write_url: str = "https://prometheus-dev.swissai.svc.cscs.ch/api/v1/write"
2727
metrics_agent_binary: str = "/capstor/store/cscs/swissai/infra01/ocf-share/vmagent"
28+
dcgm_exporter_binary: str = "/capstor/store/cscs/swissai/infra01/ocf-share/dcgm-exporter"
29+
disable_dcgm_exporter: bool = False
30+
disable_metrics: bool = False
2831

2932
@model_validator(mode="after")
3033
def set_defaults(self) -> "LaunchArgs":

0 commit comments

Comments
 (0)