Skip to content

Commit d698c39

Browse files
[rqd/proto] Add robust GPU support with cross-platform discovery and per-device tracking
Implement Phase 1 of comprehensive GPU support enhancement: - Issue: [GPU/Proto/RQD/Cuebot/RESTGateway/CueAdmin/CueGUI] OpenCue GPU Support - Comprehensive Audit and Implementation Plan - #2035 Protobuf schema extensions: - Add `GpuDevice` message with `vendor`, `model`, `memory`, `PCI bus`, `driver version`, and `CUDA`/`Metal` version fields to `host.proto` - Add `GpuUsage` message for per-device utilization tracking (`util %`, `memory used`) - Extend `Host` and `NestedHost` messages with `gpu_devices` repeated field - Extend `RenderHost` with `gpu_devices` for detailed GPU inventory reporting - Extend `RunningFrameInfo` with `gpu_usage` for per-frame GPU metrics - Add GPU constraint fields to Layer: `gpu_vendor`, `gpu_models_allowed`, `min_gpu_memory_bytes` for scheduler filtering - Add `gpu_usage` to `Frame` and `UpdatedFrame` messages for accounting RQD GPU discovery implementation: - Implement `GpuDiscovery` abstract base class for pluggable GPU backends - Implement `NvidiaGpuDiscovery` with `NVML` (`pynvml`) support and `nvidia-smi` fallback for detailed NVIDIA GPU metadata collection - Implement `AppleMetalGpuDiscovery` for macOS Apple Silicon GPU detection via `system_profiler` JSON parsing - Update Machine class with platform-specific GPU discovery initialization (Linux - NVIDIA, Darwin - Apple Metal, Windows - NVIDIA) - Populate `gpu_devices` in `RenderHost` for all platforms (`Linux`, `macOS`, `Windows`) GPU isolation and monitoring: - Set `CUDA_VISIBLE_DEVICES` and `NVIDIA_VISIBLE_DEVICES` environment variables in `rqcore.py` for proper GPU isolation in launched frames - Collect per-device GPU utilization in `__updateGpuAndLlu()` using new `getGpuUtilization()` method - Add `gpuUsage` list to `RunningFrame` class for tracking per-frame GPU metrics - Extend `runningFrameInfo()` to include `gpu_usage` in `RunningFrameInfo` proto Dependencies: - Add `pynvml>=11.5.0` to `rqd/pyproject.toml` for `NVML` GPU querying All changes maintain backward compatibility via optional/repeated proto fields. Legacy `num_gpus` and `gpu_memory` fields preserved for existing clients.
1 parent 95d93c2 commit d698c39

File tree

7 files changed

+263
-3
lines changed

7 files changed

+263
-3
lines changed

proto/src/host.proto

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,26 @@ enum ThreadMode {
225225

226226
// -------- Primary Message Types --------]
227227

228+
// GPU device information for detailed GPU inventory
229+
message GpuDevice {
230+
string id = 1; // Device ID (e.g., "0", "1")
231+
string vendor = 2; // "NVIDIA", "AMD", "Apple", "Intel"
232+
string model = 3; // "Tesla V100", "Apple M3 Max", etc.
233+
uint64 memory_bytes = 4; // Total memory in bytes
234+
string pci_bus = 5; // PCI bus ID (e.g., "0000:01:00.0") or "integrated"
235+
string driver_version = 6; // Driver version
236+
string cuda_version = 7; // CUDA compute capability (e.g., "7.0") or Metal version
237+
map<string, string> attributes = 8; // Extensible metadata
238+
}
239+
240+
// Per-GPU utilization telemetry
241+
message GpuUsage {
242+
string device_id = 1; // Matches GpuDevice.id
243+
uint32 utilization_pct = 2; // 0-100
244+
uint64 memory_used_bytes = 3; // Current memory usage in bytes
245+
uint32 temperature_c = 4; // Temperature in Celsius (optional)
246+
}
247+
228248
message Deed {
229249
string id = 1;
230250
string host = 2;
@@ -274,6 +294,7 @@ message Host {
274294
ThreadMode thread_mode = 27;
275295
float gpus = 28;
276296
float idle_gpus = 29;
297+
repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
277298
}
278299

279300
message HostSearchCriteria {
@@ -321,6 +342,7 @@ message NestedHost {
321342
NestedProcSeq procs = 28;
322343
float gpus = 29;
323344
float idle_gpus = 30;
345+
repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
324346
}
325347

326348
message NestedHostSeq {

proto/src/job.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";
99

1010
import "comment.proto";
1111
import "depend.proto";
12+
import "host.proto";
1213
import "limit.proto";
1314
import "renderPartition.proto";
1415

16+
// Note: GpuUsage is defined in host.proto
17+
1518
// Job related messages and services
1619
// This includes Job, Layer, Frame, and Group objects
1720

@@ -520,6 +523,7 @@ message Frame {
520523
int64 max_gpu_memory = 21;
521524
int64 used_gpu_memory = 22;
522525
FrameStateDisplayOverride frame_state_display_override = 23;
526+
repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
523527
}
524528

525529
// Object for frame searching
@@ -566,6 +570,7 @@ message UpdatedFrame {
566570
int64 max_gpu_memory = 11;
567571
int64 used_gpu_memory = 12;
568572
FrameStateDisplayOverride frame_state_display_override = 13;
573+
repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
569574
}
570575

571576
message UpdatedFrameSeq {
@@ -714,6 +719,9 @@ message Layer {
714719
float min_gpus = 20;
715720
float max_gpus = 21;
716721
string command = 22;
722+
string gpu_vendor = 23; // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
723+
repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
724+
uint64 min_gpu_memory_bytes = 25; // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
717725
}
718726

719727
message LayerSeq {

proto/src/report.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";
99

1010
import "host.proto";
1111

12+
// Note: GpuDevice and GpuUsage are defined in host.proto
13+
1214
// Interface to handle RQD pings.
1315

1416

@@ -82,9 +84,10 @@ message RenderHost {
8284
repeated string tags = 15; // an array of default tags that are added to the host record
8385
host.HardwareState state = 16; // hardware state for the host
8486
map<string, string> attributes = 17; // additional data can be provided about the host
85-
int32 num_gpus = 18; // the number of physical GPU's
87+
int32 num_gpus = 18; // the number of physical GPU's (legacy, use gpu_devices for details)
8688
int64 free_gpu_mem = 19; // the current amount of free gpu memory in kB
8789
int64 total_gpu_mem = 20; // the total size of gpu memory in kB
90+
repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
8891
};
8992

9093
message RunningFrameInfo {
@@ -107,6 +110,7 @@ message RunningFrameInfo {
107110
int64 used_gpu_memory = 17; // kB
108111
ChildrenProcStats children = 18; //additional data about the running frame's child processes
109112
int64 used_swap_memory = 19; // kB
113+
repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
110114
};
111115

112116
message ChildrenProcStats {

rqd/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ dependencies = [
1111
"opencue_proto",
1212
"psutil==5.9.8",
1313
"pynput==1.7.6",
14-
"future==1.0.0"
14+
"future==1.0.0",
15+
"pynvml>=11.5.0"
1516
]
1617
requires-python = ">3.7"
1718
description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."

rqd/rqd/rqcore.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,11 @@ def __createEnvVariables(self):
820820

821821
# Add GPU's to use all assigned GPU cores
822822
if 'GPU_LIST' in self.runFrame.attributes:
823-
self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
823+
gpu_list = self.runFrame.attributes['GPU_LIST']
824+
self.frameEnv['CUE_GPU_CORES'] = gpu_list
825+
# Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
826+
self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
827+
self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list
824828

825829
# pylint: disable=inconsistent-return-statements
826830
def _createCommandFile(self, command):

0 commit comments

Comments
 (0)