Skip to content

Commit d698c39

Browse files
[rqd/proto] Add robust GPU support with cross-platform discovery and per-device tracking
Implement Phase 1 of comprehensive GPU support enhancement: - Issue: [GPU/Proto/RQD/Cuebot/RESTGateway/CueAdmin/CueGUI] OpenCue GPU Support - Comprehensive Audit and Implementation Plan - #2035 Protobuf schema extensions: - Add `GpuDevice` message with `vendor`, `model`, `memory`, `PCI bus`, `driver version`, and `CUDA`/`Metal` version fields to `host.proto` - Add `GpuUsage` message for per-device utilization tracking (`util %`, `memory used`) - Extend `Host` and `NestedHost` messages with `gpu_devices` repeated field - Extend `RenderHost` with `gpu_devices` for detailed GPU inventory reporting - Extend `RunningFrameInfo` with `gpu_usage` for per-frame GPU metrics - Add GPU constraint fields to Layer: `gpu_vendor`, `gpu_models_allowed`, `min_gpu_memory_bytes` for scheduler filtering - Add `gpu_usage` to `Frame` and `UpdatedFrame` messages for accounting RQD GPU discovery implementation: - Implement `GpuDiscovery` abstract base class for pluggable GPU backends - Implement `NvidiaGpuDiscovery` with `NVML` (`pynvml`) support and `nvidia-smi` fallback for detailed NVIDIA GPU metadata collection - Implement `AppleMetalGpuDiscovery` for macOS Apple Silicon GPU detection via `system_profiler` JSON parsing - Update Machine class with platform-specific GPU discovery initialization (Linux - NVIDIA, Darwin - Apple Metal, Windows - NVIDIA) - Populate `gpu_devices` in `RenderHost` for all platforms (`Linux`, `macOS`, `Windows`) GPU isolation and monitoring: - Set `CUDA_VISIBLE_DEVICES` and `NVIDIA_VISIBLE_DEVICES` environment variables in `rqcore.py` for proper GPU isolation in launched frames - Collect per-device GPU utilization in `__updateGpuAndLlu()` using new `getGpuUtilization()` method - Add `gpuUsage` list to `RunningFrame` class for tracking per-frame GPU metrics - Extend `runningFrameInfo()` to include `gpu_usage` in `RunningFrameInfo` proto Dependencies: - Add `pynvml>=11.5.0` to `rqd/pyproject.toml` for `NVML` GPU querying All changes maintain backward compatibility via optional/repeated proto fields. Legacy `num_gpus` and `gpu_memory` fields preserved for existing clients.
1 parent 95d93c2 commit d698c39

File tree

7 files changed

+263
-3
lines changed

7 files changed

+263
-3
lines changed

proto/src/host.proto

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,26 @@ enum ThreadMode {
225225

226226
// -------- Primary Message Types --------]
227227

228+
// GPU device information for detailed GPU inventory
229+
message GpuDevice {
230+
string id = 1; // Device ID (e.g., "0", "1")
231+
string vendor = 2; // "NVIDIA", "AMD", "Apple", "Intel"
232+
string model = 3; // "Tesla V100", "Apple M3 Max", etc.
233+
uint64 memory_bytes = 4; // Total memory in bytes
234+
string pci_bus = 5; // PCI bus ID (e.g., "0000:01:00.0") or "integrated"
235+
string driver_version = 6; // Driver version
236+
string cuda_version = 7; // CUDA compute capability (e.g., "7.0") or Metal version
237+
map<string, string> attributes = 8; // Extensible metadata
238+
}
239+
240+
// Per-GPU utilization telemetry
241+
message GpuUsage {
242+
string device_id = 1; // Matches GpuDevice.id
243+
uint32 utilization_pct = 2; // 0-100
244+
uint64 memory_used_bytes = 3; // Current memory usage in bytes
245+
uint32 temperature_c = 4; // Temperature in Celsius (optional)
246+
}
247+
228248
message Deed {
229249
string id = 1;
230250
string host = 2;
@@ -274,6 +294,7 @@ message Host {
274294
ThreadMode thread_mode = 27;
275295
float gpus = 28;
276296
float idle_gpus = 29;
297+
repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
277298
}
278299

279300
message HostSearchCriteria {
@@ -321,6 +342,7 @@ message NestedHost {
321342
NestedProcSeq procs = 28;
322343
float gpus = 29;
323344
float idle_gpus = 30;
345+
repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
324346
}
325347

326348
message NestedHostSeq {

proto/src/job.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";
99

1010
import "comment.proto";
1111
import "depend.proto";
12+
import "host.proto";
1213
import "limit.proto";
1314
import "renderPartition.proto";
1415

16+
// Note: GpuUsage is defined in host.proto
17+
1518
// Job related messages and services
1619
// This includes Job, Layer, Frame, and Group objects
1720

@@ -520,6 +523,7 @@ message Frame {
520523
int64 max_gpu_memory = 21;
521524
int64 used_gpu_memory = 22;
522525
FrameStateDisplayOverride frame_state_display_override = 23;
526+
repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
523527
}
524528

525529
// Object for frame searching
@@ -566,6 +570,7 @@ message UpdatedFrame {
566570
int64 max_gpu_memory = 11;
567571
int64 used_gpu_memory = 12;
568572
FrameStateDisplayOverride frame_state_display_override = 13;
573+
repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
569574
}
570575

571576
message UpdatedFrameSeq {
@@ -714,6 +719,9 @@ message Layer {
714719
float min_gpus = 20;
715720
float max_gpus = 21;
716721
string command = 22;
722+
string gpu_vendor = 23; // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
723+
repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
724+
uint64 min_gpu_memory_bytes = 25; // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
717725
}
718726

719727
message LayerSeq {

proto/src/report.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";
99

1010
import "host.proto";
1111

12+
// Note: GpuDevice and GpuUsage are defined in host.proto
13+
1214
// Interface to handle RQD pings.
1315

1416

@@ -82,9 +84,10 @@ message RenderHost {
8284
repeated string tags = 15; // an array of default tags that are added to the host record
8385
host.HardwareState state = 16; // hardware state for the host
8486
map<string, string> attributes = 17; // additional data can be provided about the host
85-
int32 num_gpus = 18; // the number of physical GPU's
87+
int32 num_gpus = 18; // the number of physical GPU's (legacy, use gpu_devices for details)
8688
int64 free_gpu_mem = 19; // the current amount of free gpu memory in kB
8789
int64 total_gpu_mem = 20; // the total size of gpu memory in kB
90+
repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
8891
};
8992

9093
message RunningFrameInfo {
@@ -107,6 +110,7 @@ message RunningFrameInfo {
107110
int64 used_gpu_memory = 17; // kB
108111
ChildrenProcStats children = 18; //additional data about the running frame's child processes
109112
int64 used_swap_memory = 19; // kB
113+
repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
110114
};
111115

112116
message ChildrenProcStats {

rqd/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ dependencies = [
1111
"opencue_proto",
1212
"psutil==5.9.8",
1313
"pynput==1.7.6",
14-
"future==1.0.0"
14+
"future==1.0.0",
15+
"pynvml>=11.5.0"
1516
]
1617
requires-python = ">3.7"
1718
description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."

rqd/rqd/rqcore.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,11 @@ def __createEnvVariables(self):
820820

821821
# Add GPU's to use all assigned GPU cores
822822
if 'GPU_LIST' in self.runFrame.attributes:
823-
self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
823+
gpu_list = self.runFrame.attributes['GPU_LIST']
824+
self.frameEnv['CUE_GPU_CORES'] = gpu_list
825+
# Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
826+
self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
827+
self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list
824828

825829
# pylint: disable=inconsistent-return-statements
826830
def _createCommandFile(self, command):

0 commit comments

Comments
 (0)