File tree Expand file tree Collapse file tree
src/swiss_ai_model_launch/assets Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -207,13 +207,7 @@ done
207207# The batch node (index 0) runs directly; worker nodes run via srun --overlap.
208208# The batch node scrapes framework metrics (8080) + DCGM (9400); workers scrape only DCGM (9400).
209209if [ -n "$METRICS_REMOTE_WRITE_URL" ] && [ -x "$METRICS_AGENT_BIN" ]; then
210- VMAGENT_COMMON_ARGS="
211- -remoteWrite.url=${METRICS_REMOTE_WRITE_URL}
212- -remoteWrite.label=slurm_job_id=${SLURM_JOB_ID}
213- -remoteWrite.label=model=${SERVED_MODEL_NAME}
214- -remoteWrite.label=framework=${FRAMEWORK}
215- -remoteWrite.label=user=${USER}
216- "
210+ VMAGENT_COMMON_ARGS=" -remoteWrite.url=${METRICS_REMOTE_WRITE_URL} -remoteWrite.label=slurm_job_id=${SLURM_JOB_ID} -remoteWrite.label=model=${SERVED_MODEL_NAME} -remoteWrite.label=framework=${FRAMEWORK} -remoteWrite.label=user=${USER} "
217211 METRICS_CONFIG_DIR="/capstor/store/cscs/swissai/infra01/ocf-share"
218212 DCGM_COMMON_ARGS="--address 0.0.0.0:9400 -f $METRICS_CONFIG_DIR/default-counters.csv"
219213 DCGM_LOG="/tmp/dcgm-exporter-${SLURM_JOB_ID}.log"
You can’t perform that action at this time.
0 commit comments