Skip to content

Commit be760b3

Browse files
[rust_rqd] Add GPU discovery infrastructure to Rust RQD
Implement cross-platform GPU discovery and monitoring infrastructure for Rust RQD, mirroring the Python RQD architecture to enable robust GPU support across both implementations. 1) New module: `system/gpu.rs` - Add `GpuDiscovery` trait defining abstract GPU discovery interface with `detect_devices()` and `get_utilization()` methods - Implement `NvidiaGpuDiscovery` with NVML support (via optional `nvml-wrapper` crate) and `nvidia-smi` fallback for detailed NVIDIA GPU metadata collection - Implement `AppleMetalGpuDiscovery` for macOS Apple Silicon GPU detection via `system_profiler` JSON parsing - Add `create_gpu_discovery()` factory function for platform-specific backend selection (Linux → NVIDIA, macOS → Apple Metal, Windows → NVIDIA) 2) `system/manager.rs`: - Import `GpuDevice` and `GpuUsage` from `opencue_proto::host` - Extend `MachineGpuStats` with `gpu_devices`: `Vec<GpuDevice>` for detailed GPU inventory alongside legacy `count`/`memory` fields - Extend `ProcessStats` with `gpu_usage`: `Vec<GpuUsage>` for per-device utilization tracking in running frames - Update `ProcessStats::default()` and `ProcessStats::update()` to handle new `gpu_usage` field 3) `system/mod.rs`: - Expose gpu module with `pub mod gpu` 4) `Cargo.toml`: - Add optional `nvml` feature flag for NVML support - Add `nvml-wrapper = { version = "0.10", optional = true }` dependency Architecture: - Trait-based abstraction matches Python class hierarchy for consistency - Optional NVML dependency via Cargo features allows compilation without NVIDIA-specific dependencies - Cross-platform design supports Linux (NVIDIA), macOS (Apple Metal), and Windows (NVIDIA) from the start - Backward compatible: retains legacy GPU fields in `MachineGpuStats` - Reuses `opencue_proto::host::{GpuDevice, GpuUsage}` proto messages directly Build with NVML: `cargo build --release --features nvml` Build without NVML: `cargo build --release` (falls back to nvidia-smi) Remaining integration work tracked in 
RUST_GPU_IMPLEMENTATION_SUMMARY.md: - Integrate GPU discovery into `MachineMonitor` - Populate `gpu_devices` in `RenderHost` reports - Add `CUDA_VISIBLE_DEVICES`/`NVIDIA_VISIBLE_DEVICES` environment variables - Collect per-frame GPU utilization during stats collection
1 parent ae28d59 commit be760b3

File tree

4 files changed

+294
-1
lines changed

4 files changed

+294
-1
lines changed

rust/crates/rqd/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ nimby = ["dep:device_query"]
1212
# Containerized Frames is a feature that allows rqd to run in a containerized environment.
1313
# This feature is highly experimental and may not be stable.
1414
containerized_frames = ["bollard"]
15+
# NVML support for NVIDIA GPU discovery and monitoring
16+
nvml = ["dep:nvml-wrapper"]
1517

1618
[[bin]]
1719
path = "src/main.rs"
@@ -66,6 +68,7 @@ device_query = { version = "3.0", optional = true }
6668
pnet = "0.35.0"
6769
log = "0.4.27"
6870
ureq = { version = "3.1.0", features = ["json"] }
71+
nvml-wrapper = { version = "0.10", optional = true }
6972

7073
[dev-dependencies]
7174
tempfile = "3.14.0"

rust/crates/rqd/src/system/gpu.rs

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
use miette::Result;
2+
use opencue_proto::host::{GpuDevice, GpuUsage};
3+
use std::collections::HashMap;
4+
use tracing::{error, info, warn};
5+
6+
/// Abstract GPU discovery interface.
///
/// Implemented once per platform/vendor backend (NVIDIA via NVML or
/// `nvidia-smi`, Apple Metal via `system_profiler`) so callers can
/// enumerate GPUs and poll per-device utilization without knowing
/// which backend is active. Backends are selected at runtime by
/// `create_gpu_discovery()`.
pub trait GpuDiscovery {
    /// Detect GPU devices on this machine.
    ///
    /// Returns one `GpuDevice` per discovered unit; an empty vector means
    /// no devices were found (not an error).
    fn detect_devices(&self) -> Result<Vec<GpuDevice>>;

    /// Get current utilization for a specific GPU device.
    ///
    /// `device_id` matches the `id` field of a `GpuDevice` returned by
    /// `detect_devices()`.
    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage>;
}
14+
15+
/// NVIDIA GPU discovery using NVML library
pub struct NvidiaGpuDiscovery {
    // True when the `nvml` Cargo feature is compiled in AND NVML
    // initialized successfully at construction time; when false, discovery
    // falls back to parsing `nvidia-smi` output.
    nvml_available: bool,
}
19+
20+
impl NvidiaGpuDiscovery {
21+
pub fn new() -> Self {
22+
let nvml_available = Self::check_nvml_available();
23+
if nvml_available {
24+
info!("Using NVML for NVIDIA GPU discovery");
25+
} else {
26+
warn!("NVML unavailable, GPU features will be limited");
27+
}
28+
Self { nvml_available }
29+
}
30+
31+
fn check_nvml_available() -> bool {
32+
#[cfg(feature = "nvml")]
33+
{
34+
match nvml_wrapper::Nvml::init() {
35+
Ok(_) => true,
36+
Err(e) => {
37+
warn!("NVML initialization failed: {}", e);
38+
false
39+
}
40+
}
41+
}
42+
#[cfg(not(feature = "nvml"))]
43+
{
44+
false
45+
}
46+
}
47+
48+
#[cfg(feature = "nvml")]
49+
fn detect_via_nvml(&self) -> Result<Vec<GpuDevice>> {
50+
use nvml_wrapper::Nvml;
51+
52+
let nvml = Nvml::init().map_err(|e| miette::miette!("NVML init failed: {}", e))?;
53+
let device_count = nvml.device_count().map_err(|e| miette::miette!("Failed to get device count: {}", e))?;
54+
55+
let mut devices = Vec::new();
56+
for i in 0..device_count {
57+
match nvml.device_by_index(i) {
58+
Ok(device) => {
59+
let name = device.name().unwrap_or_else(|_| "Unknown".to_string());
60+
let memory_info = device.memory_info().ok();
61+
let pci_info = device.pci_info().ok();
62+
let driver_version = nvml.sys_driver_version().unwrap_or_else(|_| "Unknown".to_string());
63+
let cuda_version = nvml.sys_cuda_driver_version().ok();
64+
65+
let gpu_device = GpuDevice {
66+
id: i.to_string(),
67+
vendor: "NVIDIA".to_string(),
68+
model: name,
69+
memory_bytes: memory_info.map(|m| m.total).unwrap_or(0),
70+
pci_bus: pci_info.map(|p| p.bus_id).unwrap_or_else(|| "Unknown".to_string()),
71+
driver_version,
72+
cuda_version: cuda_version.map(|v| format!("{}.{}", v / 1000, (v % 1000) / 10)).unwrap_or_else(|| "Unknown".to_string()),
73+
attributes: HashMap::new(),
74+
};
75+
devices.push(gpu_device);
76+
}
77+
Err(e) => {
78+
warn!("Failed to get device {}: {}", i, e);
79+
}
80+
}
81+
}
82+
Ok(devices)
83+
}
84+
85+
#[cfg(not(feature = "nvml"))]
86+
fn detect_via_nvml(&self) -> Result<Vec<GpuDevice>> {
87+
Ok(Vec::new())
88+
}
89+
90+
fn detect_via_smi(&self) -> Result<Vec<GpuDevice>> {
91+
use std::process::Command;
92+
93+
let output = Command::new("nvidia-smi")
94+
.args(&[
95+
"--query-gpu=index,name,memory.total,pci.bus_id,driver_version",
96+
"--format=csv,noheader,nounits",
97+
])
98+
.output()
99+
.map_err(|e| miette::miette!("Failed to run nvidia-smi: {}", e))?;
100+
101+
if !output.status.success() {
102+
return Err(miette::miette!("nvidia-smi command failed"));
103+
}
104+
105+
let stdout = String::from_utf8_lossy(&output.stdout);
106+
let mut devices = Vec::new();
107+
108+
for line in stdout.lines() {
109+
let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
110+
if parts.len() >= 5 {
111+
let memory_mb: f64 = parts[2].parse().unwrap_or(0.0);
112+
let memory_bytes = (memory_mb * 1_048_576.0) as u64; // MB to bytes
113+
114+
let gpu_device = GpuDevice {
115+
id: parts[0].to_string(),
116+
vendor: "NVIDIA".to_string(),
117+
model: parts[1].to_string(),
118+
memory_bytes,
119+
pci_bus: parts[3].to_string(),
120+
driver_version: parts[4].to_string(),
121+
cuda_version: "Unknown".to_string(),
122+
attributes: HashMap::new(),
123+
};
124+
devices.push(gpu_device);
125+
}
126+
}
127+
128+
Ok(devices)
129+
}
130+
}
131+
132+
impl GpuDiscovery for NvidiaGpuDiscovery {
133+
fn detect_devices(&self) -> Result<Vec<GpuDevice>> {
134+
if self.nvml_available {
135+
self.detect_via_nvml()
136+
} else {
137+
self.detect_via_smi()
138+
}
139+
}
140+
141+
fn get_utilization(&self, device_id: &str) -> Result<GpuUsage> {
142+
#[cfg(feature = "nvml")]
143+
{
144+
if self.nvml_available {
145+
use nvml_wrapper::Nvml;
146+
147+
let nvml = Nvml::init().map_err(|e| miette::miette!("NVML init failed: {}", e))?;
148+
let index: u32 = device_id.parse().map_err(|e| miette::miette!("Invalid device ID: {}", e))?;
149+
let device = nvml.device_by_index(index).map_err(|e| miette::miette!("Device not found: {}", e))?;
150+
151+
let utilization = device.utilization_rates().ok();
152+
let memory_info = device.memory_info().ok();
153+
let temperature = device.temperature(nvml_wrapper::enum_wrappers::device::TemperatureSensor::Gpu).ok();
154+
155+
return Ok(GpuUsage {
156+
device_id: device_id.to_string(),
157+
utilization_pct: utilization.map(|u| u.gpu).unwrap_or(0),
158+
memory_used_bytes: memory_info.map(|m| m.used).unwrap_or(0),
159+
temperature_c: temperature.unwrap_or(0),
160+
});
161+
}
162+
}
163+
164+
// Fallback: return empty usage
165+
Ok(GpuUsage {
166+
device_id: device_id.to_string(),
167+
utilization_pct: 0,
168+
memory_used_bytes: 0,
169+
temperature_c: 0,
170+
})
171+
}
172+
}
173+
174+
/// Apple Metal GPU discovery for macOS.
///
/// Stateless backend: all device data is read from `system_profiler` at
/// call time, so the struct carries no fields.
pub struct AppleMetalGpuDiscovery;

impl AppleMetalGpuDiscovery {
    /// Create a new Apple Metal discovery backend. No probing is required;
    /// `system_profiler` ships with every macOS installation.
    pub fn new() -> Self {
        Self
    }

    /// Parse a human-readable VRAM string (e.g. "16 GB", "1536 MB",
    /// "1.5 TB") into bytes.
    ///
    /// Accepts fractional values and KB/MB/GB/TB units in any letter case;
    /// returns 0 for anything it cannot understand (including negative or
    /// missing values), so callers never see a parse error.
    fn parse_vram(vram_str: &str) -> u64 {
        let parts: Vec<&str> = vram_str.split_whitespace().collect();
        if parts.len() < 2 {
            return 0;
        }
        let value: f64 = match parts[0].parse() {
            Ok(v) if v >= 0.0 => v,
            _ => return 0,
        };
        // Binary multipliers, matching how macOS reports VRAM sizes.
        let multiplier: f64 = match parts[1].to_ascii_uppercase().as_str() {
            "KB" => (1u64 << 10) as f64,
            "MB" => (1u64 << 20) as f64,
            "GB" => (1u64 << 30) as f64,
            "TB" => (1u64 << 40) as f64,
            _ => return 0,
        };
        (value * multiplier) as u64
    }
}
197+
198+
impl GpuDiscovery for AppleMetalGpuDiscovery {
199+
fn detect_devices(&self) -> Result<Vec<GpuDevice>> {
200+
use std::process::Command;
201+
202+
let output = Command::new("system_profiler")
203+
.args(&["SPDisplaysDataType", "-json"])
204+
.output()
205+
.map_err(|e| miette::miette!("Failed to run system_profiler: {}", e))?;
206+
207+
if !output.status.success() {
208+
return Err(miette::miette!("system_profiler command failed"));
209+
}
210+
211+
let stdout = String::from_utf8_lossy(&output.stdout);
212+
let json_data: serde_json::Value = serde_json::from_str(&stdout)
213+
.map_err(|e| miette::miette!("Failed to parse JSON: {}", e))?;
214+
215+
let mut devices = Vec::new();
216+
let mut gpu_idx = 0;
217+
218+
if let Some(displays) = json_data["SPDisplaysDataType"].as_array() {
219+
for display in displays {
220+
let chipset_model = display["sppci_model"]
221+
.as_str()
222+
.unwrap_or("Unknown")
223+
.to_string();
224+
let vram = display["spdisplays_vram"]
225+
.as_str()
226+
.unwrap_or("0 MB")
227+
.to_string();
228+
let vram_bytes = Self::parse_vram(&vram);
229+
230+
let mut attributes = HashMap::new();
231+
attributes.insert("metal_supported".to_string(), "true".to_string());
232+
233+
let gpu_device = GpuDevice {
234+
id: gpu_idx.to_string(),
235+
vendor: "Apple".to_string(),
236+
model: chipset_model,
237+
memory_bytes: vram_bytes,
238+
pci_bus: "integrated".to_string(),
239+
driver_version: "Metal".to_string(),
240+
cuda_version: "N/A".to_string(),
241+
attributes,
242+
};
243+
devices.push(gpu_device);
244+
gpu_idx += 1;
245+
}
246+
}
247+
248+
Ok(devices)
249+
}
250+
251+
fn get_utilization(&self, device_id: &str) -> Result<GpuUsage> {
252+
// Apple Metal does not expose per-process GPU utilization
253+
Ok(GpuUsage {
254+
device_id: device_id.to_string(),
255+
utilization_pct: 0,
256+
memory_used_bytes: 0,
257+
temperature_c: 0,
258+
})
259+
}
260+
}
261+
262+
/// Factory function to create the appropriate GPU discovery backend for this platform
263+
pub fn create_gpu_discovery() -> Option<Box<dyn GpuDiscovery + Send + Sync>> {
264+
#[cfg(target_os = "linux")]
265+
{
266+
Some(Box::new(NvidiaGpuDiscovery::new()))
267+
}
268+
269+
#[cfg(target_os = "macos")]
270+
{
271+
Some(Box::new(AppleMetalGpuDiscovery::new()))
272+
}
273+
274+
#[cfg(target_os = "windows")]
275+
{
276+
Some(Box::new(NvidiaGpuDiscovery::new()))
277+
}
278+
279+
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
280+
{
281+
None
282+
}
283+
}

rust/crates/rqd/src/system/manager.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use std::collections::HashMap;
22

33
use miette::{Diagnostic, Result};
4-
use opencue_proto::{host::HardwareState, report::ChildrenProcStats};
4+
use opencue_proto::{host::{GpuDevice, GpuUsage, HardwareState}, report::ChildrenProcStats};
55
use thiserror::Error;
66
use tracing::error;
77
use uuid::Uuid;
@@ -99,6 +99,8 @@ pub struct MachineGpuStats {
9999
pub free_memory: u64,
100100
/// Used memory by unit of each GPU, where the key in the HashMap is the unit ID, and the value is the used memory
101101
pub _used_memory_by_unit: HashMap<u32, u64>,
102+
/// Detailed GPU device inventory
103+
pub gpu_devices: Vec<GpuDevice>,
102104
}
103105

104106
/// Tracks memory and runtime statistics for a rendering process and its children.
@@ -118,6 +120,8 @@ pub struct ProcessStats {
118120
pub max_used_gpu_memory: u64,
119121
/// Current GPU memory usage (KB).
120122
pub used_gpu_memory: u64,
123+
/// Per-device GPU usage statistics
124+
pub gpu_usage: Vec<GpuUsage>,
121125
/// Additional data about the running frame's child processes.
122126
pub children: Option<ChildrenProcStats>,
123127
/// Unix timestamp denoting the start time of the frame process.
@@ -136,6 +140,7 @@ impl Default for ProcessStats {
136140
llu_time: 0,
137141
max_used_gpu_memory: 0,
138142
used_gpu_memory: 0,
143+
gpu_usage: Vec::new(),
139144
children: None,
140145
epoch_start_time: std::time::SystemTime::now()
141146
.duration_since(std::time::UNIX_EPOCH)
@@ -157,6 +162,7 @@ impl ProcessStats {
157162
vsize: new.vsize,
158163
llu_time: new.llu_time,
159164
used_gpu_memory: new.used_gpu_memory,
165+
gpu_usage: new.gpu_usage,
160166
children: new.children,
161167
epoch_start_time: new.epoch_start_time,
162168
};

rust/crates/rqd/src/system/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use uuid::Uuid;
22

3+
pub mod gpu;
34
pub mod linux;
45
pub mod machine;
56
#[cfg(feature = "nimby")]

0 commit comments

Comments
 (0)