Skip to content

Commit 33f932c

Browse files
committed
switch to nvidia-smi
1 parent 43de006 commit 33f932c

8 files changed

Lines changed: 48 additions & 163 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 46 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coman/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ tarpc = { version = "0.37.0", features = [
9191
] }
9292
tokio-duplex = "1.0.1"
9393
sysinfo = "0.38.0"
94-
nvml-wrapper = "0.11.0"
9594
bytesize = "2.3.1"
9695

9796
[build-dependencies]

coman/src/cli/rpc.rs

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1+
use bytesize::ByteSize;
12
use futures::StreamExt;
23
use iroh::protocol::ProtocolHandler;
3-
use nvml_wrapper::Nvml;
44
use serde::{Deserialize, Serialize};
55
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System, get_current_pid};
66
use tarpc::{
@@ -16,7 +16,7 @@ pub struct ResourceUsage {
1616
pub cpu: f32,
1717
pub rss: u64,
1818
pub vss: u64,
19-
pub gpu: Option<u64>,
19+
pub gpu: Option<Vec<(u64, u64)>>,
2020
}
2121

2222
#[tarpc::service]
@@ -43,25 +43,30 @@ impl ComanRPC for RpcServer {
4343
let Some(process) = sys.process(pid) else {
4444
return ResourceUsage::default();
4545
};
46-
let gpu_usage = match Nvml::init() {
47-
Ok(nvml) => match nvml.device_by_index(0) {
48-
Ok(device) => match device.memory_info() {
49-
Ok(memory_info) => Some(memory_info.used),
50-
Err(e) => {
51-
println!("Couldn't get GPU memory info: {e:?}");
52-
None
53-
}
54-
},
55-
Err(e) => {
56-
println!("couldn't load nvidia device 0: {e:?}");
57-
None
58-
}
59-
},
60-
Err(e) => {
61-
println!("Nvidia Device Info not available: {e:?}");
62-
None
63-
}
46+
let gpu_usage = if let Ok(output) = std::process::Command::new("nvidia-smi")
47+
.args(vec![
48+
"--query-gpu=memory.total,memory.used",
49+
"--format=csv,noheader,nounits",
50+
])
51+
.output()
52+
{
53+
let output = String::from_utf8_lossy(&output.stdout);
54+
let usage = output
55+
.lines()
56+
.map(|l| l.split_once(",").unwrap())
57+
.map(|(total, used)| {
58+
(
59+
ByteSize::mib(total.trim().parse::<u64>().unwrap()).as_u64(),
60+
ByteSize::mib(used.trim().parse::<u64>().unwrap()).as_u64(),
61+
)
62+
})
63+
.collect();
64+
Some(usage)
65+
} else {
66+
println!("Failed to execute nvidia-smi, maybe it's not installed");
67+
None
6468
};
69+
6570
ResourceUsage {
6671
cpu: process.cpu_usage() / sys.cpus().len() as f32,
6772
rss: process.memory(),

coman/src/cscs/api_client/client.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,8 @@ impl CscsApi {
222222
None => Ok("".to_string()),
223223
}
224224
}
225-
pub async fn list_path(&self, system_name: &str, path: PathBuf) -> Result<Vec<PathEntry>> {
226-
let result = get_filesystem_ops_ls(&self.client, system_name, path)
225+
pub async fn list_path(&self, system_name: &str, path: PathBuf, show_hidden: bool) -> Result<Vec<PathEntry>> {
226+
let result = get_filesystem_ops_ls(&self.client, system_name, path, show_hidden)
227227
.await
228228
.wrap_err("couldn't list path")?;
229229
match result.output {

coman/src/cscs/cli.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,18 @@ pub(crate) async fn cli_cscs_job_resource_usage(
149149
result.cpu,
150150
ByteSize::b(result.rss).display().iec(),
151151
ByteSize::b(result.vss).display().iec(),
152-
result.gpu.map(|g| g.to_string()).unwrap_or("N/A".to_string())
152+
result
153+
.gpu
154+
.map(|g| g
155+
.into_iter()
156+
.map(|(total, used)| format!(
157+
"{}/{}({:.1}%)",
158+
ByteSize::b(used).display().iec(),
159+
ByteSize::b(total).display().iec(),
160+
used as f64 / total as f64 * 100.0
161+
))
162+
.join(", "))
163+
.unwrap_or("N/A".to_string())
153164
);
154165

155166
Ok(())

coman/src/cscs/dcgm_enroot_hook.sh

Lines changed: 0 additions & 60 deletions
This file was deleted.

coman/src/cscs/handlers.rs

Lines changed: 4 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ use crate::{
5252
};
5353

5454
const CSCS_MAX_DIRECT_SIZE: usize = 5242880;
55-
const DCGM_ENROOT_HOOK: &str = include_str!("./dcgm_enroot_hook.sh");
5655

5756
async fn get_access_token() -> Result<Secret> {
5857
let client_id = match get_secret(CLIENT_ID_SECRET_NAME).await {
@@ -751,30 +750,6 @@ async fn handle_script(
751750

752751
Ok(script_path)
753752
}
754-
async fn setup_dcgm_hook(api_client: &CscsApi, current_system: &str) -> Result<()> {
755-
let user_dirs = file_system_roots(Some(FileSystemType::Users)).await?;
756-
let user_dir = user_dirs
757-
.first()
758-
.ok_or(eyre!("couldn't find user root directory on remote"))?;
759-
let path = PathBuf::from(user_dir.name.clone())
760-
.join(".config")
761-
.join("enroot")
762-
.join("hooks.d")
763-
.join("cscs_jobreport_dcgm_hook.sh");
764-
765-
let response = api_client.checksum(current_system, path.clone()).await;
766-
if let Ok(Some(_)) = response {
767-
// file exists
768-
return Ok(());
769-
}
770-
api_client
771-
.mkdir(current_system, path.parent().unwrap().to_path_buf())
772-
.await?;
773-
api_client
774-
.upload(current_system, path, DCGM_ENROOT_HOOK.as_bytes().to_vec())
775-
.await
776-
.wrap_err("couldn't upload dcgm enroot hook ".to_string())
777-
}
778753

779754
pub async fn cscs_job_start(
780755
name: Option<String>,
@@ -832,9 +807,6 @@ pub async fn cscs_job_start(
832807
if coman_squash.is_none() {
833808
println!("Warning: coman squash wasn't templated and is needed for ssh through coman to work");
834809
}
835-
if let Err(e) = setup_dcgm_hook(&api_client, current_system).await {
836-
println!("Warning: couldn't set up dcgm hook: {e:?}");
837-
}
838810
let environment_path = handle_edf(
839811
&api_client,
840812
&base_path,
@@ -890,7 +862,7 @@ pub async fn cscs_file_list(
890862
let api_client = CscsApi::new(access_token.0, platform).unwrap();
891863
let config = Config::new().unwrap();
892864
api_client
893-
.list_path(&system.unwrap_or(config.values.cscs.current_system), path)
865+
.list_path(&system.unwrap_or(config.values.cscs.current_system), path, false)
894866
.await
895867
}
896868
Err(e) => Err(e),
@@ -946,7 +918,7 @@ pub async fn cscs_file_delete(
946918
let api_client = CscsApi::new(access_token.0, platform).unwrap();
947919
let config = Config::new().unwrap();
948920
let current_system = &system.unwrap_or(config.values.cscs.current_system);
949-
let paths = api_client.list_path(current_system, remote.clone()).await?;
921+
let paths = api_client.list_path(current_system, remote.clone(), false).await?;
950922
let path = paths.first().ok_or(eyre!("remote path doesn't exist"))?;
951923
if let PathType::Directory = path.path_type {
952924
return Err(eyre!("remote path must be a file, not directory"));
@@ -975,7 +947,7 @@ pub async fn cscs_file_download(
975947
let api_client = CscsApi::new(access_token.0, platform).unwrap();
976948
let config = Config::new().unwrap();
977949
let current_system = &system.unwrap_or(config.values.cscs.current_system);
978-
let paths = api_client.list_path(current_system, remote.clone()).await?;
950+
let paths = api_client.list_path(current_system, remote.clone(), false).await?;
979951
let path = paths.first().ok_or(eyre!("remote path doesn't exist"))?;
980952
if let PathType::Directory = path.path_type {
981953
return Err(eyre!("remote path must be a file, not directory"));
@@ -1008,7 +980,7 @@ pub async fn cscs_file_upload(
1008980
let api_client = CscsApi::new(access_token.0, platform).unwrap();
1009981
let config = Config::new().unwrap();
1010982
let current_system = &system.unwrap_or(config.values.cscs.current_system);
1011-
let existing = api_client.list_path(current_system, remote.clone()).await?;
983+
let existing = api_client.list_path(current_system, remote.clone(), false).await?;
1012984
let remote = if !existing.is_empty() {
1013985
if existing.len() == 1 && existing[0].path_type == PathType::File {
1014986
return Err(eyre!("remote file already exists"));

firecrest_client/src/filesystem_api.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,16 @@ pub async fn get_filesystem_ops_ls(
1616
client: &FirecrestClient,
1717
system_name: &str,
1818
path: PathBuf,
19+
show_hidden: bool,
1920
) -> Result<GetDirectoryLsResponse> {
2021
let path = path.as_os_str().to_str().ok_or(eyre!("couldn't cast path to string"))?;
2122
let response = client
2223
.get(
2324
format!("filesystem/{system_name}/ops/ls").as_str(),
24-
Some(vec![("path", path), ("showHidden", "true")]),
25+
Some(vec![
26+
("path", path),
27+
("showHidden", if show_hidden { "true" } else { "false" }),
28+
]),
2529
)
2630
.await?;
2731
let model: GetDirectoryLsResponse = serde_json::from_str(response.as_str())?;

0 commit comments

Comments
 (0)