Skip to content

Commit 9995730

Browse files
committed
switch to nvidia-smi
1 parent 43de006 commit 9995730

6 files changed

Lines changed: 37 additions & 156 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 46 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coman/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,6 @@ tarpc = { version = "0.37.0", features = [
9191
] }
9292
tokio-duplex = "1.0.1"
9393
sysinfo = "0.38.0"
94-
nvml-wrapper = "0.11.0"
9594
bytesize = "2.3.1"
9695

9796
[build-dependencies]

coman/src/cli/rpc.rs

Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1+
use bytesize::ByteSize;
12
use futures::StreamExt;
23
use iroh::protocol::ProtocolHandler;
3-
use nvml_wrapper::Nvml;
44
use serde::{Deserialize, Serialize};
55
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System, get_current_pid};
66
use tarpc::{
@@ -16,7 +16,7 @@ pub struct ResourceUsage {
1616
pub cpu: f32,
1717
pub rss: u64,
1818
pub vss: u64,
19-
pub gpu: Option<u64>,
19+
pub gpu: Option<Vec<(u64, u64)>>,
2020
}
2121

2222
#[tarpc::service]
@@ -43,25 +43,30 @@ impl ComanRPC for RpcServer {
4343
let Some(process) = sys.process(pid) else {
4444
return ResourceUsage::default();
4545
};
46-
let gpu_usage = match Nvml::init() {
47-
Ok(nvml) => match nvml.device_by_index(0) {
48-
Ok(device) => match device.memory_info() {
49-
Ok(memory_info) => Some(memory_info.used),
50-
Err(e) => {
51-
println!("Couldn't get GPU memory info: {e:?}");
52-
None
53-
}
54-
},
55-
Err(e) => {
56-
println!("couldn't load nvidia device 0: {e:?}");
57-
None
58-
}
59-
},
60-
Err(e) => {
61-
println!("Nvidia Device Info not available: {e:?}");
62-
None
63-
}
46+
let gpu_usage = if let Ok(output) = std::process::Command::new("nvidia-smi")
47+
.args(vec![
48+
"--query-gpu=memory.total,memory.used",
49+
"--format=csv,noheader,nounits",
50+
])
51+
.output()
52+
{
53+
let output = String::from_utf8_lossy(&output.stdout);
54+
let usage = output
55+
.lines()
56+
.map(|l| l.split_once(",").unwrap())
57+
.map(|(total, used)| {
58+
(
59+
ByteSize::mib(total.trim().parse::<u64>().unwrap()).as_u64(),
60+
ByteSize::mib(used.trim().parse::<u64>().unwrap()).as_u64(),
61+
)
62+
})
63+
.collect();
64+
Some(usage)
65+
} else {
66+
println!("Failed to execute nvidia-smi, maybe it's not installed");
67+
None
6468
};
69+
6570
ResourceUsage {
6671
cpu: process.cpu_usage() / sys.cpus().len() as f32,
6772
rss: process.memory(),

coman/src/cscs/cli.rs

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,18 @@ pub(crate) async fn cli_cscs_job_resource_usage(
149149
result.cpu,
150150
ByteSize::b(result.rss).display().iec(),
151151
ByteSize::b(result.vss).display().iec(),
152-
result.gpu.map(|g| g.to_string()).unwrap_or("N/A".to_string())
152+
result
153+
.gpu
154+
.map(|g| g
155+
.into_iter()
156+
.map(|(total, used)| format!(
157+
"{}/{}({:.1}%)",
158+
ByteSize::b(used).display().iec(),
159+
ByteSize::b(total).display().iec(),
160+
used as f64 / total as f64 * 100.0
161+
))
162+
.join(", "))
163+
.unwrap_or("N/A".to_string())
153164
);
154165

155166
Ok(())

coman/src/cscs/dcgm_enroot_hook.sh

Lines changed: 0 additions & 60 deletions
This file was deleted.

coman/src/cscs/handlers.rs

Lines changed: 0 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,6 @@ use crate::{
5252
};
5353

5454
const CSCS_MAX_DIRECT_SIZE: usize = 5242880;
55-
const DCGM_ENROOT_HOOK: &str = include_str!("./dcgm_enroot_hook.sh");
5655

5756
async fn get_access_token() -> Result<Secret> {
5857
let client_id = match get_secret(CLIENT_ID_SECRET_NAME).await {
@@ -751,30 +750,6 @@ async fn handle_script(
751750

752751
Ok(script_path)
753752
}
754-
async fn setup_dcgm_hook(api_client: &CscsApi, current_system: &str) -> Result<()> {
755-
let user_dirs = file_system_roots(Some(FileSystemType::Users)).await?;
756-
let user_dir = user_dirs
757-
.first()
758-
.ok_or(eyre!("couldn't find user root directory on remote"))?;
759-
let path = PathBuf::from(user_dir.name.clone())
760-
.join(".config")
761-
.join("enroot")
762-
.join("hooks.d")
763-
.join("cscs_jobreport_dcgm_hook.sh");
764-
765-
let response = api_client.checksum(current_system, path.clone()).await;
766-
if let Ok(Some(_)) = response {
767-
// file exists
768-
return Ok(());
769-
}
770-
api_client
771-
.mkdir(current_system, path.parent().unwrap().to_path_buf())
772-
.await?;
773-
api_client
774-
.upload(current_system, path, DCGM_ENROOT_HOOK.as_bytes().to_vec())
775-
.await
776-
.wrap_err("couldn't upload dcgm enroot hook ".to_string())
777-
}
778753

779754
pub async fn cscs_job_start(
780755
name: Option<String>,
@@ -832,9 +807,6 @@ pub async fn cscs_job_start(
832807
if coman_squash.is_none() {
833808
println!("Warning: coman squash wasn't templated and is needed for ssh through coman to work");
834809
}
835-
if let Err(e) = setup_dcgm_hook(&api_client, current_system).await {
836-
println!("Warning: couldn't set up dcgm hook: {e:?}");
837-
}
838810
let environment_path = handle_edf(
839811
&api_client,
840812
&base_path,

0 commit comments

Comments
 (0)