Skip to content

Commit 74c914f

Browse files
committed
add dcgm support and mount libs
1 parent bcd1ec6 commit 74c914f

8 files changed

Lines changed: 125 additions & 20 deletions

File tree

coman/.config/config.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ com.hooks.ssh.enabled = "true"
6060
com.hooks.ssh.authorize_ssh_key = "{{ ssh_public_key }}"
6161
com.hooks.ssh.port = "15263"
6262
{% endif %}
63+
com.hooks.dcgm.enabled = "true"
6364
"""
6465

6566
# set environment variables that should be passed to a job

coman/src/cli/app.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ fn remote_path_completer(current: &std::ffi::OsStr) -> Vec<CompletionCandidate>
408408
let (send, mut recv) = mpsc::unbounded_channel();
409409
if current.is_empty() || current == "/" {
410410
tokio::spawn(async move {
411-
let roots = file_system_roots().await;
411+
let roots = file_system_roots(None).await;
412412
if let Ok(roots) = roots {
413413
for root in roots {
414414
send.send(CompletionCandidate::new(root.name.clone())).unwrap();

coman/src/cli/rpc.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use futures::StreamExt;
22
use iroh::protocol::ProtocolHandler;
33
use nvml_wrapper::Nvml;
44
use serde::{Deserialize, Serialize};
5-
use sysinfo::System;
5+
use sysinfo::{System, get_current_pid};
66
use tarpc::{
77
serde_transport as transport, server, server::Channel, tokio_serde::formats::Bincode,
88
tokio_util::codec::LengthDelimitedCodec,
@@ -11,11 +11,11 @@ use tokio_duplex::Duplex;
1111

1212
use crate::cli::app::COMAN_VERSION;
1313

14-
#[derive(Debug, Clone, Serialize, Deserialize)]
14+
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
1515
pub struct ResourceUsage {
1616
pub cpu: f32,
17-
pub mem_used: u64,
18-
pub mem_total: u64,
17+
pub rss: u64,
18+
pub vss: u64,
1919
pub gpu: Option<u64>,
2020
}
2121

@@ -35,11 +35,12 @@ impl ComanRPC for RpcServer {
3535
async fn resource_usage(self, _context: ::tarpc::context::Context) -> ResourceUsage {
3636
let mut sys = System::new_all();
3737
sys.refresh_all();
38-
let mut cpu_usage = 0.0;
39-
for cpu in sys.cpus() {
40-
cpu_usage += cpu.cpu_usage();
41-
}
42-
cpu_usage /= sys.cpus().len() as f32;
38+
let Ok(pid) = get_current_pid() else {
39+
return ResourceUsage::default();
40+
};
41+
let Some(process) = sys.process(pid) else {
42+
return ResourceUsage::default();
43+
};
4344
let gpu_usage = match Nvml::init() {
4445
Ok(nvml) => match nvml.device_by_index(0) {
4546
Ok(device) => match device.memory_info() {
@@ -60,9 +61,9 @@ impl ComanRPC for RpcServer {
6061
}
6162
};
6263
ResourceUsage {
63-
cpu: cpu_usage,
64-
mem_used: sys.used_memory(),
65-
mem_total: sys.total_memory(),
64+
cpu: process.cpu_usage(),
65+
rss: process.memory(),
66+
vss: process.virtual_memory(),
6667
gpu: gpu_usage,
6768
}
6869
}

coman/src/cscs/cli.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::{
44
time::{Duration, Instant},
55
};
66

7+
use bytesize::ByteSize;
78
use color_eyre::{Result, eyre::Context};
89
use eyre::eyre;
910
use futures::StreamExt;
@@ -142,8 +143,16 @@ pub(crate) async fn cli_cscs_job_resource_usage(
142143
platform: Option<ComputePlatform>,
143144
) -> Result<()> {
144145
let job_id = maybe_job_id_from_name(job, system.clone(), platform.clone()).await?;
145-
println!("running port forward for job {job_id}");
146-
cscs_resource_usage(job_id, system).await
146+
let result = cscs_resource_usage(job_id, system).await?;
147+
println!(
148+
"CPU: {}, Memory: RSS {} VSS {}, GPU: {}",
149+
result.cpu,
150+
ByteSize::b(result.rss).display().iec().to_string(),
151+
ByteSize::b(result.vss).display().iec().to_string(),
152+
result.gpu.map(|g| g.to_string()).unwrap_or("N/A".to_string())
153+
);
154+
155+
Ok(())
147156
}
148157

149158
#[allow(clippy::too_many_arguments)]

coman/src/cscs/dcgm_enroot_hook.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env bash
#
# Enroot hook: bind-mounts the host's DCGM directory and DCGM shared libraries
# into the container root filesystem and refreshes the container's dynamic
# linker cache so the mounted libraries can be resolved.
#
# The hook is opt-in: it does nothing unless the environment variable
# OCI_ANNOTATION_com__hooks__dcgm__enabled is exactly "true"
# (presumably derived from the `com.hooks.dcgm.enabled` annotation -- confirm
# against the enroot/EDF annotation handling).
#
# Required environment:
#   ENROOT_LIBRARY_PATH  directory containing enroot's common.sh
#   ENROOT_ROOTFS        root filesystem of the container being started
#
# NOTE: this file is embedded verbatim into the binary, so it must be plain
# shell -- quotes must NOT be backslash-escaped as they would be inside a
# string literal.

set -euo pipefail
shopt -s lastpipe nullglob

# ldconfig usually lives in an sbin directory that may not be on PATH.
export PATH="${PATH}:/usr/sbin:/sbin"

source "${ENROOT_LIBRARY_PATH}/common.sh"

common::checkcmd grep sed ldd ldconfig

# Skip silently unless the hook was explicitly enabled.
if [ "${OCI_ANNOTATION_com__hooks__dcgm__enabled:-}" != "true" ]; then
    exit 0
fi

# Mounting the specified DCGM libraries and directories explicitly
cat << EOF | enroot-mount --root "${ENROOT_ROOTFS}" -
/usr/local/dcgm /usr/local/dcgm none x-create=dir,bind,ro,nosuid,nodev,private
/usr/lib64/libnvperf_dcgm_host.so /usr/lib64/libnvperf_dcgm_host.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulesysmon.so.3.3.6 /usr/lib64/libdcgmmodulesysmon.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulesysmon.so.3 /usr/lib64/libdcgmmodulesysmon.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulesysmon.so /usr/lib64/libdcgmmodulesysmon.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleprofiling.so.3.3.6 /usr/lib64/libdcgmmoduleprofiling.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleprofiling.so.3 /usr/lib64/libdcgmmoduleprofiling.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleprofiling.so /usr/lib64/libdcgmmoduleprofiling.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulepolicy.so.3.3.6 /usr/lib64/libdcgmmodulepolicy.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulepolicy.so.3 /usr/lib64/libdcgmmodulepolicy.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulepolicy.so /usr/lib64/libdcgmmodulepolicy.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulenvswitch.so.3.3.6 /usr/lib64/libdcgmmodulenvswitch.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulenvswitch.so.3 /usr/lib64/libdcgmmodulenvswitch.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulenvswitch.so /usr/lib64/libdcgmmodulenvswitch.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleintrospect.so.3.3.6 /usr/lib64/libdcgmmoduleintrospect.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleintrospect.so.3 /usr/lib64/libdcgmmoduleintrospect.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleintrospect.so /usr/lib64/libdcgmmoduleintrospect.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulehealth.so.3.3.6 /usr/lib64/libdcgmmodulehealth.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulehealth.so.3 /usr/lib64/libdcgmmodulehealth.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulehealth.so /usr/lib64/libdcgmmodulehealth.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulediag.so.3.3.6 /usr/lib64/libdcgmmodulediag.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulediag.so.3 /usr/lib64/libdcgmmodulediag.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmodulediag.so /usr/lib64/libdcgmmodulediag.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleconfig.so.3.3.6 /usr/lib64/libdcgmmoduleconfig.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleconfig.so.3 /usr/lib64/libdcgmmoduleconfig.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgmmoduleconfig.so /usr/lib64/libdcgmmoduleconfig.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm_stub.a /usr/lib64/libdcgm_stub.a none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm_cublas_proxy12.so /usr/lib64/libdcgm_cublas_proxy12.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm_cublas_proxy11.so /usr/lib64/libdcgm_cublas_proxy11.so none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm.so.3.3.6 /usr/lib64/libdcgm.so.3.3.6 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm.so.3 /usr/lib64/libdcgm.so.3 none x-create=file,bind,ro,nosuid,nodev,private
/usr/lib64/libdcgm.so /usr/lib64/libdcgm.so none x-create=file,bind,ro,nosuid,nodev,private
EOF

# Refresh the dynamic linker cache to include newly mounted libs
cat << EOF > "${ENROOT_ROOTFS}/etc/ld.so.conf.d/enroot-dcgm-hook.conf"
/lib64
/usr/lib64
EOF

# ldconfig output is appended to a log file inside the container rootfs.
if ! ${ldconfig:-ldconfig} -r "${ENROOT_ROOTFS}" >> "${ENROOT_ROOTFS}/dcgm-hook.log" 2>&1; then
    common::err "Failed to refresh the dynamic linker cache"
fi

coman/src/cscs/handlers.rs

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ use crate::{
5252
};
5353

5454
const CSCS_MAX_DIRECT_SIZE: usize = 5242880;
55+
const DCGM_ENROOT_HOOK: &str = include_str!("./dcgm_enroot_hook.sh");
5556

5657
async fn get_access_token() -> Result<Secret> {
5758
let client_id = match get_secret(CLIENT_ID_SECRET_NAME).await {
@@ -750,6 +751,27 @@ async fn handle_script(
750751

751752
Ok(script_path)
752753
}
754+
async fn setup_dcgm_hook(api_client: &CscsApi, current_system: &str) -> Result<()> {
755+
let user_dirs = file_system_roots(Some(FileSystemType::Users)).await?;
756+
let user_dir = user_dirs
757+
.first()
758+
.ok_or(eyre!("couldn't find user root directory on remote"))?;
759+
let path = PathBuf::from(user_dir.name.clone())
760+
.join(".config")
761+
.join("enroot")
762+
.join("hooks.d")
763+
.join("cscs_jobreport_dcgm_hook.sh");
764+
765+
let response = api_client.checksum(current_system, path.clone()).await;
766+
if let Ok(Some(_)) = response {
767+
// file exists
768+
return Ok(());
769+
}
770+
api_client
771+
.upload(current_system, path, DCGM_ENROOT_HOOK.as_bytes().to_vec())
772+
.await
773+
.wrap_err(format!("couldn't upload dcgm enroot hook "))
774+
}
753775

754776
pub async fn cscs_job_start(
755777
name: Option<String>,
@@ -807,7 +829,9 @@ pub async fn cscs_job_start(
807829
if coman_squash.is_none() {
808830
println!("Warning: coman squash wasn't templated and is needed for ssh through coman to work");
809831
}
810-
832+
if let Err(e) = setup_dcgm_hook(&api_client, &current_system).await {
833+
println!("Warning: couldn't set up dcgm hook: {e:?}");
834+
}
811835
let environment_path = handle_edf(
812836
&api_client,
813837
&base_path,
@@ -869,7 +893,7 @@ pub async fn cscs_file_list(
869893
Err(e) => Err(e),
870894
}
871895
}
872-
pub async fn file_system_roots() -> Result<Vec<PathEntry>> {
896+
pub async fn file_system_roots(type_filter: Option<FileSystemType>) -> Result<Vec<PathEntry>> {
873897
let config = Config::new().expect("couldn't load config");
874898
let user_info = cscs_user_info(None, None).await?;
875899
let systems = cscs_system_list(None).await?;
@@ -878,7 +902,17 @@ pub async fn file_system_roots() -> Result<Vec<PathEntry>> {
878902
.find(|s| s.name == config.values.cscs.current_system)
879903
.unwrap_or_else(|| panic!("couldn't get info for system {}", config.values.cscs.current_system));
880904
let mut subpaths = vec![];
881-
for fs in system.file_systems.clone() {
905+
let filesystems = if let Some(filter) = type_filter {
906+
system
907+
.file_systems
908+
.clone()
909+
.into_iter()
910+
.filter(|fs| fs.data_type == filter)
911+
.collect()
912+
} else {
913+
system.file_systems.clone()
914+
};
915+
for fs in filesystems {
882916
let entry = match cscs_stat_path(PathBuf::from(fs.path.clone()).join(user_info.name.clone()), None, None).await
883917
{
884918
Ok(Some(_)) => PathEntry {

coman/src/cscs/ports.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ async fn list_files(id: PathBuf) -> Result<Option<Event<UserEvent>>> {
248248
.map_err(|_| eyre!("couldn't convert id to string".to_owned()))?;
249249
if id_str == "/" {
250250
// load file system roots
251-
let subpaths = file_system_roots().await?;
251+
let subpaths = file_system_roots(None).await?;
252252
Ok(Some(Event::User(UserEvent::File(FileEvent::List(id_str, subpaths)))))
253253
} else {
254254
let subpaths = cscs_file_list(id, None, None).await?;

firecrest_client/src/filesystem_api.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ pub async fn get_filesystem_ops_ls(
2121
let response = client
2222
.get(
2323
format!("filesystem/{system_name}/ops/ls").as_str(),
24-
Some(vec![("path", path)]),
24+
Some(vec![("path", path), ("showHidden", "false")]),
2525
)
2626
.await?;
2727
let model: GetDirectoryLsResponse = serde_json::from_str(response.as_str())?;

0 commit comments

Comments
 (0)