Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.13
1.14
22 changes: 22 additions & 0 deletions proto/src/host.proto
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,26 @@ enum ThreadMode {

// -------- Primary Message Types --------

// GPU device information for detailed GPU inventory
// GPU device information for detailed GPU inventory.
// One entry per physical (or integrated) GPU on a host; referenced by
// GpuUsage via device_id.
message GpuDevice {
string id = 1; // Device identifier, e.g. "0", "1" (matched by GpuUsage.device_id)
string vendor = 2; // Vendor name: "NVIDIA", "AMD", "Apple", "Intel"
string model = 3; // Marketing model name, e.g. "Tesla V100", "Apple M3 Max"
uint64 memory_bytes = 4; // Total on-device memory, in bytes
string pci_bus = 5; // PCI bus ID (e.g. "0000:01:00.0"), or the literal "integrated" for SoC GPUs
string driver_version = 6; // Installed driver version string
string cuda_version = 7; // NOTE(review): field name says "version" but the example "7.0" is a
                         // CUDA compute capability (or Metal version on Apple) — confirm which
                         // semantics producers actually fill in before consumers rely on it
map<string, string> attributes = 8; // Extensible free-form metadata (keys/values are producer-defined)
}

// Per-GPU utilization telemetry
// Per-GPU utilization telemetry, one sample per device.
message GpuUsage {
  // Identifier of the sampled device; matches GpuDevice.id.
  string device_id = 1;
  // Instantaneous GPU utilization, 0-100 percent.
  uint32 utilization_pct = 2;
  // Memory currently in use on the device, in bytes.
  uint64 memory_used_bytes = 3;
  // Temperature in degrees Celsius. Declared `optional` for explicit
  // presence: without it, a proto3 scalar's zero value is not serialized,
  // so "temperature not reported" would be indistinguishable from a
  // genuine 0 C reading. Safe to add here since this message is new in
  // this change.
  optional uint32 temperature_c = 4;
}

message Deed {
string id = 1;
string host = 2;
Expand Down Expand Up @@ -274,6 +294,7 @@ message Host {
ThreadMode thread_mode = 27;
float gpus = 28;
float idle_gpus = 29;
repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
}

message HostSearchCriteria {
Expand Down Expand Up @@ -321,6 +342,7 @@ message NestedHost {
NestedProcSeq procs = 28;
float gpus = 29;
float idle_gpus = 30;
repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
}

message NestedHostSeq {
Expand Down
8 changes: 8 additions & 0 deletions proto/src/job.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";

import "comment.proto";
import "depend.proto";
import "host.proto";
import "limit.proto";
import "renderPartition.proto";

// Note: GpuUsage is defined in host.proto

// Job related messages and services
// This includes Job, Layer, Frame, and Group objects

Expand Down Expand Up @@ -520,6 +523,7 @@ message Frame {
int64 max_gpu_memory = 21;
int64 used_gpu_memory = 22;
FrameStateDisplayOverride frame_state_display_override = 23;
repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
}

// Object for frame searching
Expand Down Expand Up @@ -566,6 +570,7 @@ message UpdatedFrame {
int64 max_gpu_memory = 11;
int64 used_gpu_memory = 12;
FrameStateDisplayOverride frame_state_display_override = 13;
repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
}

message UpdatedFrameSeq {
Expand Down Expand Up @@ -714,6 +719,9 @@ message Layer {
float min_gpus = 20;
float max_gpus = 21;
string command = 22;
string gpu_vendor = 23; // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
uint64 min_gpu_memory_bytes = 25; // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
}

message LayerSeq {
Expand Down
6 changes: 5 additions & 1 deletion proto/src/report.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";

import "host.proto";

// Note: GpuDevice and GpuUsage are defined in host.proto

// Interface to handle RQD pings.


Expand Down Expand Up @@ -82,9 +84,10 @@ message RenderHost {
repeated string tags = 15; // an array of default tags that are added to the host record
host.HardwareState state = 16; // hardware state for the host
map<string, string> attributes = 17; // additional data can be provided about the host
int32 num_gpus = 18; // the number of physical GPU's
int32 num_gpus = 18; // the number of physical GPUs (legacy, use gpu_devices for details)
int64 free_gpu_mem = 19; // the current amount of free gpu memory in kB
int64 total_gpu_mem = 20; // the total size of gpu memory in kB
repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
};

message RunningFrameInfo {
Expand All @@ -107,6 +110,7 @@ message RunningFrameInfo {
int64 used_gpu_memory = 17; // kB
ChildrenProcStats children = 18; //additional data about the running frame's child processes
int64 used_swap_memory = 19; // kB
repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
};

message ChildrenProcStats {
Expand Down
3 changes: 2 additions & 1 deletion rqd/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ dependencies = [
"opencue_proto",
"psutil==5.9.8",
"pynput==1.7.6",
"future==1.0.0"
"future==1.0.0",
"pynvml>=11.5.0"
]
requires-python = ">3.7"
description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."
Expand Down
6 changes: 5 additions & 1 deletion rqd/rqd/rqcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,11 @@ def __createEnvVariables(self):

# Add GPU's to use all assigned GPU cores
if 'GPU_LIST' in self.runFrame.attributes:
self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
gpu_list = self.runFrame.attributes['GPU_LIST']
self.frameEnv['CUE_GPU_CORES'] = gpu_list
# Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list

# pylint: disable=inconsistent-return-statements
def _createCommandFile(self, command):
Expand Down
Loading
Loading