Skip to content

Commit fc6a19f

Browse files
committed
add dcgm support and mount libs
1 parent bcd1ec6 commit fc6a19f

4 files changed

Lines changed: 96 additions & 3 deletions

File tree

coman/.config/config.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ com.hooks.ssh.enabled = "true"
6060
com.hooks.ssh.authorize_ssh_key = "{{ ssh_public_key }}"
6161
com.hooks.ssh.port = "15263"
6262
{% endif %}
63+
com.hooks.dcgm.enabled = "true"
6364
"""
6465

6566
# set environment variables that should be passed to a job

coman/src/cli/dcgm_enroot_hook.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env bash
#
# Enroot hook that makes the host's NVIDIA DCGM installation visible inside the
# container root filesystem.
#
# The hook does nothing unless OCI_ANNOTATION_com__hooks__dcgm__enabled is set
# to "true". When enabled it:
#   1. bind-mounts /usr/local/dcgm and the DCGM shared libraries from the
#      host's /usr/lib64 read-only into ${ENROOT_ROOTFS};
#   2. registers /lib64 and /usr/lib64 with the container's dynamic linker and
#      rebuilds the linker cache so the mounted libraries can be resolved.
#
# Reads ENROOT_LIBRARY_PATH (location of enroot's common.sh) and ENROOT_ROOTFS
# (container root) from the environment.

set -euo pipefail
shopt -s lastpipe nullglob

export PATH="${PATH}:/usr/sbin:/sbin"

source "${ENROOT_LIBRARY_PATH}/common.sh"

common::checkcmd grep sed ldd ldconfig

# Opt-in only: leave the container untouched unless explicitly enabled.
if [ "${OCI_ANNOTATION_com__hooks__dcgm__enabled:-}" != "true" ]; then
    exit 0
fi

# Mounting the specified DCGM libraries and directories explicitly.
# NOTE: the library versions (3.3.6) are pinned to the files present on the
# host; this list has to be updated when the host's DCGM version changes.
# The heredoc delimiter is quoted so the fstab entries are passed on verbatim.
cat << 'EOF' | enroot-mount --root "${ENROOT_ROOTFS}" -
/usr/local/dcgm /usr/local/dcgm none x-create=dir,bind,ro,nosuid,nodev,private
/usr/lib64/libnvperf_dcgm_host.so /usr/lib64/libnvperf_dcgm_host.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulesysmon.so.3.3.6 /usr/lib64/libdcgmmodulesysmon.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulesysmon.so.3 /usr/lib64/libdcgmmodulesysmon.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulesysmon.so /usr/lib64/libdcgmmodulesysmon.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleprofiling.so.3.3.6 /usr/lib64/libdcgmmoduleprofiling.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleprofiling.so.3 /usr/lib64/libdcgmmoduleprofiling.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleprofiling.so /usr/lib64/libdcgmmoduleprofiling.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulepolicy.so.3.3.6 /usr/lib64/libdcgmmodulepolicy.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulepolicy.so.3 /usr/lib64/libdcgmmodulepolicy.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulepolicy.so /usr/lib64/libdcgmmodulepolicy.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulenvswitch.so.3.3.6 /usr/lib64/libdcgmmodulenvswitch.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulenvswitch.so.3 /usr/lib64/libdcgmmodulenvswitch.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulenvswitch.so /usr/lib64/libdcgmmodulenvswitch.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleintrospect.so.3.3.6 /usr/lib64/libdcgmmoduleintrospect.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleintrospect.so.3 /usr/lib64/libdcgmmoduleintrospect.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleintrospect.so /usr/lib64/libdcgmmoduleintrospect.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulehealth.so.3.3.6 /usr/lib64/libdcgmmodulehealth.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulehealth.so.3 /usr/lib64/libdcgmmodulehealth.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulehealth.so /usr/lib64/libdcgmmodulehealth.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulediag.so.3.3.6 /usr/lib64/libdcgmmodulediag.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulediag.so.3 /usr/lib64/libdcgmmodulediag.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulediag.so /usr/lib64/libdcgmmodulediag.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleconfig.so.3.3.6 /usr/lib64/libdcgmmoduleconfig.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleconfig.so.3 /usr/lib64/libdcgmmoduleconfig.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleconfig.so /usr/lib64/libdcgmmoduleconfig.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm_stub.a /usr/lib64/libdcgm_stub.a none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm_cublas_proxy12.so /usr/lib64/libdcgm_cublas_proxy12.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm_cublas_proxy11.so /usr/lib64/libdcgm_cublas_proxy11.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm.so.3.3.6 /usr/lib64/libdcgm.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm.so.3 /usr/lib64/libdcgm.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm.so /usr/lib64/libdcgm.so none x-create=file,bind,ro,nosuid,nodev,private
EOF

# Refresh the dynamic linker cache to include newly mounted libs.
# Minimal images may ship without /etc/ld.so.conf.d; create it so the redirect
# below cannot abort the hook under `set -e`.
mkdir -p "${ENROOT_ROOTFS}/etc/ld.so.conf.d"
cat << 'EOF' > "${ENROOT_ROOTFS}/etc/ld.so.conf.d/enroot-dcgm-hook.conf"
/lib64
/usr/lib64
EOF

# ldconfig output is appended to a log file inside the container root so a
# failed cache refresh can be inspected afterwards.
if ! ${ldconfig:-ldconfig} -r "${ENROOT_ROOTFS}" >> "${ENROOT_ROOTFS}/dcgm-hook.log" 2>&1; then
    common::err "Failed to refresh the dynamic linker cache"
fi

coman/src/cli/exec.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
use std::{thread, time::Duration};
22

33
use base64::prelude::*;
4-
use color_eyre::Result;
4+
use color_eyre::{
5+
Result,
6+
eyre::{WrapErr, eyre},
7+
};
58
use iroh::{
69
Endpoint, SecretKey,
710
endpoint::ConnectionError,
@@ -16,6 +19,7 @@ use crate::cli::rpc::RpcHandler;
1619
const SECRET_KEY_ENV: &str = "COMAN_IROH_SECRET";
1720
const PORT_FORWARD_ENV: &str = "COMAN_FORWARDED_PORTS";
1821
const SSH_PORT: u16 = 15263;
22+
const DCGM_ENROOT_HOOK: &str = include_str!("./dcgm_enroot_hook.sh");
1923

2024
fn get_secret_key() -> Option<Vec<u8>> {
2125
if let Ok(secret) = std::env::var(SECRET_KEY_ENV) {
@@ -144,6 +148,9 @@ pub(crate) async fn cli_exec_command(command: Vec<String>) -> Result<()> {
144148
.launch()
145149
.expect("Launch failed");
146150

151+
if let Err(e) = setup_dcgm_hook().await {
152+
println!("couldn't set up dcgm hook, resource usage monitoring might not work: {e:?}");
153+
}
147154
let mut supervisor = Supervisor::new(SupervisorConfig::default());
148155
supervisor.add_process("port-forward", ChildType::Permanent, || {
149156
thread::spawn(|| {
@@ -173,3 +180,19 @@ pub(crate) async fn cli_exec_command(command: Vec<String>) -> Result<()> {
173180
}
174181
Ok(())
175182
}
183+
184+
async fn setup_dcgm_hook() -> Result<()> {
185+
let user_dirs = directories::UserDirs::new().ok_or(eyre!("couldn't find home dir"))?;
186+
let user_home = user_dirs.home_dir();
187+
let dcgm_hook_path = user_home
188+
.join(".config")
189+
.join("enroot")
190+
.join("hooks.d")
191+
.join("cscs_jobreport_dcgm_hook.sh");
192+
if dcgm_hook_path.exists() {
193+
return Ok(());
194+
}
195+
std::fs::create_dir_all(dcgm_hook_path.parent().unwrap())?;
196+
std::fs::write(&dcgm_hook_path, DCGM_ENROOT_HOOK)
197+
.wrap_err(format!("couldn't dcgm enroot hook {}", dcgm_hook_path.display()))
198+
}

coman/src/cscs/cli.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::{
44
time::{Duration, Instant},
55
};
66

7+
use bytesize::ByteSize;
78
use color_eyre::{Result, eyre::Context};
89
use eyre::eyre;
910
use futures::StreamExt;
@@ -142,8 +143,16 @@ pub(crate) async fn cli_cscs_job_resource_usage(
142143
platform: Option<ComputePlatform>,
143144
) -> Result<()> {
144145
let job_id = maybe_job_id_from_name(job, system.clone(), platform.clone()).await?;
145-
println!("running port forward for job {job_id}");
146-
cscs_resource_usage(job_id, system).await
146+
let result = cscs_resource_usage(job_id, system).await?;
147+
println!(
148+
"CPU: {}, Memory: {}/{}, GPU: {}",
149+
result.cpu,
150+
ByteSize::b(result.mem_used).display().iec().to_string(),
151+
ByteSize::b(result.mem_total).display().iec().to_string(),
152+
result.gpu.map(|g| g.to_string()).unwrap_or("N/A".to_string())
153+
);
154+
155+
Ok(())
147156
}
148157

149158
#[allow(clippy::too_many_arguments)]

0 commit comments

Comments
 (0)