Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.13
1.14
22 changes: 22 additions & 0 deletions proto/src/host.proto
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,26 @@ enum ThreadMode {

// -------- Primary Message Types --------

// GPU device information for detailed GPU inventory
// GPU device information for detailed GPU inventory.
// One entry per physical (or integrated) GPU on a host; referenced by
// GpuUsage via device_id.
message GpuDevice {
string id = 1; // Device identifier, e.g. "0", "1" (matched by GpuUsage.device_id)
string vendor = 2; // Vendor name: "NVIDIA", "AMD", "Apple", "Intel"
string model = 3; // Marketing model name, e.g. "Tesla V100", "Apple M3 Max"
uint64 memory_bytes = 4; // Total on-device memory, in bytes
string pci_bus = 5; // PCI bus ID (e.g. "0000:01:00.0"), or the literal "integrated" for SoC GPUs
string driver_version = 6; // Installed driver version string
string cuda_version = 7; // NOTE(review): field name says "version" but the example "7.0" is a
                         // CUDA compute capability (or Metal version on Apple) — confirm which
                         // semantics producers actually fill in before consumers rely on it
map<string, string> attributes = 8; // Extensible free-form metadata (keys/values are producer-defined)
}

// Per-GPU utilization telemetry
// Per-GPU utilization telemetry, one sample per device.
message GpuUsage {
  // Identifier of the sampled device; matches GpuDevice.id.
  string device_id = 1;
  // Instantaneous GPU utilization, 0-100 percent.
  uint32 utilization_pct = 2;
  // Memory currently in use on the device, in bytes.
  uint64 memory_used_bytes = 3;
  // Temperature in degrees Celsius. Declared `optional` for explicit
  // presence: without it, a proto3 scalar's zero value is not serialized,
  // so "temperature not reported" would be indistinguishable from a
  // genuine 0 C reading. Safe to add here since this message is new in
  // this change.
  optional uint32 temperature_c = 4;
}

message Deed {
string id = 1;
string host = 2;
Expand Down Expand Up @@ -274,6 +294,7 @@ message Host {
ThreadMode thread_mode = 27;
float gpus = 28;
float idle_gpus = 29;
repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
}

message HostSearchCriteria {
Expand Down Expand Up @@ -321,6 +342,7 @@ message NestedHost {
NestedProcSeq procs = 28;
float gpus = 29;
float idle_gpus = 30;
repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
}

message NestedHostSeq {
Expand Down
8 changes: 8 additions & 0 deletions proto/src/job.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";

import "comment.proto";
import "depend.proto";
import "host.proto";
import "limit.proto";
import "renderPartition.proto";

// Note: GpuUsage is defined in host.proto

// Job related messages and services
// This includes Job, Layer, Frame, and Group objects

Expand Down Expand Up @@ -520,6 +523,7 @@ message Frame {
int64 max_gpu_memory = 21;
int64 used_gpu_memory = 22;
FrameStateDisplayOverride frame_state_display_override = 23;
repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
}

// Object for frame searching
Expand Down Expand Up @@ -566,6 +570,7 @@ message UpdatedFrame {
int64 max_gpu_memory = 11;
int64 used_gpu_memory = 12;
FrameStateDisplayOverride frame_state_display_override = 13;
repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
}

message UpdatedFrameSeq {
Expand Down Expand Up @@ -714,6 +719,9 @@ message Layer {
float min_gpus = 20;
float max_gpus = 21;
string command = 22;
string gpu_vendor = 23; // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
uint64 min_gpu_memory_bytes = 25; // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
}

message LayerSeq {
Expand Down
6 changes: 5 additions & 1 deletion proto/src/report.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";

import "host.proto";

// Note: GpuDevice and GpuUsage are defined in host.proto

// Interface to handle RQD pings.


Expand Down Expand Up @@ -82,9 +84,10 @@ message RenderHost {
repeated string tags = 15; // an array of default tags that are added to the host record
host.HardwareState state = 16; // hardware state for the host
map<string, string> attributes = 17; // additional data can be provided about the host
int32 num_gpus = 18; // the number of physical GPU's
int32 num_gpus = 18; // the number of physical GPUs (legacy, use gpu_devices for details)
int64 free_gpu_mem = 19; // the current amount of free gpu memory in kB
int64 total_gpu_mem = 20; // the total size of gpu memory in kB
repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
};

message RunningFrameInfo {
Expand All @@ -107,6 +110,7 @@ message RunningFrameInfo {
int64 used_gpu_memory = 17; // kB
ChildrenProcStats children = 18; //additional data about the running frame's child processes
int64 used_swap_memory = 19; // kB
repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
};

message ChildrenProcStats {
Expand Down
3 changes: 2 additions & 1 deletion rqd/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ dependencies = [
"opencue_proto",
"psutil==5.9.8",
"pynput==1.7.6",
"future==1.0.0"
"future==1.0.0",
"pynvml>=11.5.0"
]
requires-python = ">3.7"
description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."
Expand Down
6 changes: 5 additions & 1 deletion rqd/rqd/rqcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,11 @@ def __createEnvVariables(self):

# Add GPU's to use all assigned GPU cores
if 'GPU_LIST' in self.runFrame.attributes:
self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
gpu_list = self.runFrame.attributes['GPU_LIST']
self.frameEnv['CUE_GPU_CORES'] = gpu_list
# Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list

# pylint: disable=inconsistent-return-statements
def _createCommandFile(self, command):
Expand Down
Loading
Loading