From b204f73c0de9bf4f66cacf4a0adf6377dd4e0f53 Mon Sep 17 00:00:00 2001 From: Dawid Ciepiela <71898979+sarumaj@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:32:18 +0200 Subject: [PATCH 1/5] sysinfo: add support for amd and nvidia gpus --- docs/GPU_MONITORING.md | 145 ++++++ frontend/app/theme.scss | 1 + frontend/app/view/sysinfo/sysinfo.tsx | 41 ++ pkg/wshrpc/wshremote/sysinfo.go | 644 ++++++++++++++++++++++++++ pkg/wshrpc/wshremote/sysinfo_test.go | 93 ++++ pkg/wshrpc/wshrpctypes.go | 1 + 6 files changed, 925 insertions(+) create mode 100644 docs/GPU_MONITORING.md create mode 100644 pkg/wshrpc/wshremote/sysinfo_test.go diff --git a/docs/GPU_MONITORING.md b/docs/GPU_MONITORING.md new file mode 100644 index 0000000000..cbce77d1c5 --- /dev/null +++ b/docs/GPU_MONITORING.md @@ -0,0 +1,145 @@ +# GPU Monitoring Support + +This document describes the GPU monitoring functionality added to the sysinfo system. + +## Overview + +The GPU monitoring feature uses command-line tools to collect GPU metrics across multiple platforms and GPU vendors. Instead of relying on Go modules, it uses `exec.Command` to execute platform-specific GPU monitoring tools. + +## Supported Platforms and Tools + +### Linux +- **NVIDIA GPUs**: Uses `nvidia-smi` command + - Collects: GPU utilization, memory usage, memory total, temperature + - Command: `nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits` + +- **AMD GPUs**: Uses `rocm-smi` command + - Collects: Memory usage, memory total, temperature + - Command: `rocm-smi --showproductname --showmeminfo vram --showtemp` + +### macOS +- Uses multiple commands for comprehensive GPU monitoring: + - `system_profiler SPDisplaysDataType` - Gets GPU names and VRAM information + - `iostat` - Attempts to get GPU utilization (if available) + - `vm_stat` - Gets memory pressure information + - `sysctl hw.memsize` - Gets system memory for estimation +- Collects: GPU names, estimated utilization, memory usage, memory total +- Automatically detects multiple GPUs (integrated + discrete) +- Estimates GPU memory based on system memory if VRAM info unavailable +- Provides more detailed information than basic system_profiler output + +### Windows +- Uses PowerShell commands for comprehensive GPU monitoring: + - `Get-WmiObject -Class Win32_VideoController` - Gets GPU names and memory information + - `Get-Counter "\GPU Engine(*)\Utilization Percentage"` - Gets GPU utilization + - `Get-Counter "\GPU Adapter Memory(*)\Dedicated Usage"` - Gets GPU memory usage +- Collects: GPU names, utilization, memory usage, memory total +- Automatically detects multiple GPUs (integrated + discrete) +- Filters out basic/standard display adapters to focus on dedicated GPUs +- Provides real-time GPU utilization using Windows Performance Counters + +## Data Structure + +GPU data is collected in the following format: + +```go +type GpuData struct { + Index int `json:"index"` // GPU index + Util float64 `json:"util"` // GPU utilization percentage + MemUsed float64 `json:"mem_used"` // Memory used in GB + MemTotal float64 `json:"mem_total"` // Total memory in GB + Temp float64 `json:"temp"` // Temperature in Celsius +} +``` + +## Metrics Collected + +The system collects the following metrics for each GPU: + +- `gpu` - Average GPU utilization across all GPUs +- `gpu:{index}:util` - GPU utilization for specific GPU +- `gpu:{index}:mem_used` - Memory used for specific GPU +- `gpu:{index}:mem_total` - Total memory for specific GPU +- `gpu:{index}:temp` - Temperature for specific GPU + +## Frontend Plot Types + +The frontend supports the following GPU-related plot types: + +- **GPU**: Shows average GPU utilization +- **All GPU**: Shows utilization for all individual GPUs +- **GPU Memory**: Shows memory usage for all GPUs +- **CPU + GPU**: Shows both CPU and GPU utilization + +## Implementation Details + +### Platform Detection +The system automatically detects the platform using `uname -s` and selects the appropriate GPU monitoring method. + +### Tool Availability Detection +Before attempting to collect GPU data, the system checks if the required tools (`nvidia-smi` or `rocm-smi`) are available on the system. + +### macOS Improvements +The macOS implementation has been significantly enhanced to provide more comprehensive GPU monitoring: + +1. **Multiple GPU Detection**: Parses `system_profiler` output to detect both integrated and discrete GPUs +2. **VRAM Information**: Extracts VRAM size from system_profiler output using regex patterns +3. **Memory Pressure**: Uses `vm_stat` to calculate memory usage and pressure +4. **GPU Utilization**: Attempts to get GPU utilization from `iostat` output +5. **Memory Estimation**: Falls back to estimating GPU memory based on system memory if VRAM info is unavailable +6. **Error Handling**: Gracefully handles missing commands and parsing errors + +### Windows Implementation +The Windows implementation provides comprehensive GPU monitoring using PowerShell: + +1. **GPU Detection**: Uses `Get-WmiObject -Class Win32_VideoController` to detect all GPUs +2. **Memory Information**: Extracts adapter RAM size and converts to GB +3. **GPU Utilization**: Uses Windows Performance Counters to get real-time GPU utilization +4. **Memory Usage**: Tracks dedicated GPU memory usage using performance counters +5. **Multi-GPU Support**: Automatically detects and monitors multiple GPUs +6. **Filtering**: Excludes basic/standard display adapters to focus on dedicated GPUs +7. **Error Handling**: Gracefully handles PowerShell execution errors and missing counters + +### Error Handling +- If no GPU tools are available, the system gracefully continues without GPU data +- Timeouts are set for all command executions to prevent hanging +- Parsing errors are handled gracefully + +### Performance +- GPU data collection is integrated into the existing sysinfo loop +- Commands are executed with timeouts to prevent blocking +- Data is collected every second along with CPU and memory data + +## Usage + +To use GPU monitoring: + +1. Ensure you have the appropriate GPU monitoring tools installed: + - For NVIDIA: Install NVIDIA drivers (includes `nvidia-smi`) + - For AMD: Install ROCm (includes `rocm-smi`) + +2. The GPU data will automatically appear in sysinfo blocks when available + +3. Select GPU plot types from the sysinfo view settings menu + +## Testing + +Run the tests to verify GPU functionality: + +```bash +cd pkg/wshrpc/wshremote +go test -v +``` + +The tests will check: +- Platform detection +- Tool availability +- GPU data collection + +## Future Enhancements + +- Windows GPU monitoring support +- More detailed macOS GPU monitoring +- GPU power consumption metrics +- GPU fan speed monitoring +- Support for additional GPU vendors (Intel, etc.) \ No newline at end of file diff --git a/frontend/app/theme.scss b/frontend/app/theme.scss index ac5d1cf857..aae1db6160 100644 --- a/frontend/app/theme.scss +++ b/frontend/app/theme.scss @@ -116,6 +116,7 @@ --sysinfo-cpu-color: #58c142; --sysinfo-mem-color: #53b4ea; + --sysinfo-gpu-color: #ff6b35; --bulb-color: rgb(255, 221, 51); diff --git a/frontend/app/view/sysinfo/sysinfo.tsx b/frontend/app/view/sysinfo/sysinfo.tsx index dc26528498..32fe7443e8 100644 --- a/frontend/app/view/sysinfo/sysinfo.tsx +++ b/frontend/app/view/sysinfo/sysinfo.tsx @@ -66,6 +66,33 @@ const PlotTypes: Object = { return valA - valB; }); }, + GPU: function (dataItem: DataItem): Array { + return ["gpu"]; + }, + "All GPU": function (dataItem: DataItem): Array { + return Object.keys(dataItem) + .filter((item) => item.startsWith("gpu:") && item.includes(":util")) + .sort((a, b) => { + const valA = parseInt(a.replace("gpu:", "").replace(":util", "")); + const valB = parseInt(b.replace("gpu:", "").replace(":util", "")); + return valA - valB; + }); + }, + "GPU Memory": function (dataItem: DataItem): Array { + return Object.keys(dataItem) + .filter((item) => item.startsWith("gpu:") && item.includes(":mem_used")) + .sort((a, b) => { + const valA = parseInt(a.replace("gpu:", "").replace(":mem_used", "")); + const valB = parseInt(b.replace("gpu:", "").replace(":mem_used", "")); + return valA - valB; + }); + }, + "CPU + GPU": function (dataItem: DataItem): Array { + return ["cpu", "gpu"]; + }, + "CPU + GPU + Mem": function (dataItem: DataItem): Array { + return ["cpu", "gpu", "mem:used"]; + }, }; const DefaultPlotMeta = { @@ -74,10 +101,24 @@ const DefaultPlotMeta = { "mem:used": defaultMemMeta("Memory Used", "mem:total"), "mem:free": defaultMemMeta("Memory Free", "mem:total"), "mem:available": defaultMemMeta("Memory Available", "mem:total"), + gpu: defaultCpuMeta("GPU %"), }; for (let i = 0; i < 32; i++) { DefaultPlotMeta[`cpu:${i}`] = defaultCpuMeta(`Core ${i}`); } +for (let i = 0; i < 8; i++) { + DefaultPlotMeta[`gpu:${i}:util`] = defaultCpuMeta(`GPU ${i} %`); + DefaultPlotMeta[`gpu:${i}:mem_used`] = defaultMemMeta(`GPU ${i} Memory Used`, "gpu:0:mem_total"); + DefaultPlotMeta[`gpu:${i}:mem_total`] = defaultMemMeta(`GPU ${i} Memory Total`, "gpu:0:mem_total"); + DefaultPlotMeta[`gpu:${i}:temp`] = { + name: `GPU ${i} Temperature`, + label: "°C", + miny: 0, + maxy: 100, + color: "var(--sysinfo-gpu-color)", + decimalPlaces: 0, + }; +} function convertWaveEventToDataItem(event: WaveEvent): DataItem { const eventData: TimeSeriesData = event.data; diff --git a/pkg/wshrpc/wshremote/sysinfo.go b/pkg/wshrpc/wshremote/sysinfo.go index c573c4d9d1..7a70eb5967 100644 --- a/pkg/wshrpc/wshremote/sysinfo.go +++ b/pkg/wshrpc/wshremote/sysinfo.go @@ -4,8 +4,12 @@ package wshremote import ( + "context" "log" + "os/exec" + "regexp" "strconv" + "strings" "time" "github.com/shirou/gopsutil/v4/cpu" @@ -18,6 +22,645 @@ import ( const BYTES_PER_GB = 1073741824 +// GPU data structure to hold parsed GPU information +type GpuData struct { + Index int `json:"index"` + Util float64 `json:"util"` + MemUsed float64 `json:"mem_used"` + MemTotal float64 `json:"mem_total"` + Temp float64 `json:"temp"` +} + +// Platform detection +func detectPlatform() string { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "uname", "-s") + output, err := cmd.Output() + if err != nil { + return "unknown" + } + + os := strings.ToLower(strings.TrimSpace(string(output))) + switch { + case strings.Contains(os, "linux"): + return "linux" + case strings.Contains(os, "darwin"): + return "darwin" + case strings.Contains(os, "windows"): + return "windows" + default: + return "unknown" + } +} + +// Check if nvidia-smi is available +func isNvidiaSmiAvailable() bool { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "nvidia-smi", "--version") + return cmd.Run() == nil +} + +// Check if rocm-smi is available (AMD GPUs) +func isRocmSmiAvailable() bool { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "rocm-smi", "--version") + return cmd.Run() == nil +} + +// Get GPU data using nvidia-smi +func getNvidiaGpuData() ([]GpuData, error) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index,utilization.gpu,memory.used,memory.total,temperature.gpu", "--format=csv,noheader,nounits") + output, err := cmd.Output() + if err != nil { + return nil, err + } + + var gpus []GpuData + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + + for _, line := range lines { + fields := strings.Split(line, ", ") + if len(fields) >= 5 { + index, _ := strconv.Atoi(strings.TrimSpace(fields[0])) + util, _ := strconv.ParseFloat(strings.TrimSpace(fields[1]), 64) + memUsed, _ := strconv.ParseFloat(strings.TrimSpace(fields[2]), 64) + memTotal, _ := strconv.ParseFloat(strings.TrimSpace(fields[3]), 64) + temp, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64) + + gpus = append(gpus, GpuData{ + Index: index, + Util: util, + MemUsed: memUsed, + MemTotal: memTotal, + Temp: temp, + }) + } + } + + return gpus, nil +} + +// Get GPU data using rocm-smi (AMD) +func getRocmGpuData() ([]GpuData, error) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "rocm-smi", "--showproductname", "--showmeminfo", "vram", "--showtemp") + output, err := cmd.Output() + if err != nil { + return nil, err + } + + var gpus []GpuData + // Parse rocm-smi output - this is more complex as it's not CSV format + // For now, we'll implement a basic parser + + // Simple regex to extract GPU info + re := regexp.MustCompile(`GPU\s+(\d+).*?VRAM Total:\s+(\d+)\s+MB.*?VRAM Used:\s+(\d+)\s+MB.*?Temperature:\s+(\d+)`) + matches := re.FindAllStringSubmatch(string(output), -1) + + for _, match := range matches { + if len(match) >= 5 { + index, _ := strconv.Atoi(match[1]) + memTotal, _ := strconv.ParseFloat(match[2], 64) + memUsed, _ := strconv.ParseFloat(match[3], 64) + temp, _ := strconv.ParseFloat(match[4], 64) + + // Convert MB to GB + memTotal = memTotal / 1024 + memUsed = memUsed / 1024 + + gpus = append(gpus, GpuData{ + Index: index, + Util: 0, // rocm-smi doesn't provide utilization in the same way + MemUsed: memUsed, + MemTotal: memTotal, + Temp: temp, + }) + } + } + + return gpus, nil +} + +// Get GPU data for macOS using multiple commands for better coverage +func getMacGpuData() ([]GpuData, error) { + var gpus []GpuData + + // Try to get GPU info using system_profiler first + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "system_profiler", "SPDisplaysDataType") + output, err := cmd.Output() + if err != nil { + return nil, err + } + + // Parse system_profiler output to get GPU names and basic info + gpuNames := parseSystemProfilerOutput(string(output)) + + // Try to get GPU utilization using iostat (if available) + gpuUtil := getMacGpuUtilization() + + // Try to get GPU memory info using vm_stat and other commands + gpuMem := getMacGpuMemory() + + // Create GPU data entries + for i, name := range gpuNames { + gpu := GpuData{ + Index: i, + Util: gpuUtil, + MemUsed: gpuMem.Used, + MemTotal: gpuMem.Total, + Temp: 0, // Temperature not easily available on macOS + } + gpus = append(gpus, gpu) + // Log GPU name for debugging + log.Printf("Found macOS GPU: %s", name) + } + + // If no GPUs found from system_profiler, create a default entry + if len(gpus) == 0 { + gpus = append(gpus, GpuData{ + Index: 0, + Util: gpuUtil, + MemUsed: gpuMem.Used, + MemTotal: gpuMem.Total, + Temp: 0, + }) + } + + return gpus, nil +} + +// Parse system_profiler output to extract GPU names +func parseSystemProfilerOutput(output string) []string { + var gpuNames []string + lines := strings.Split(output, "\n") + + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, "Chipset Model:") { + // Extract GPU name + parts := strings.Split(line, ":") + if len(parts) >= 2 { + gpuName := strings.TrimSpace(parts[1]) + if gpuName != "" && gpuName != "Unknown" { + gpuNames = append(gpuNames, gpuName) + } + } + } + } + + return gpuNames +} + +// Get GPU utilization using iostat (if available) +func getMacGpuUtilization() float64 { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + // Try to get GPU utilization using iostat + cmd := exec.CommandContext(ctx, "iostat", "-n", "1", "1") + output, err := cmd.Output() + if err != nil { + return 0 + } + + // Parse iostat output for GPU utilization + lines := strings.Split(string(output), "\n") + for _, line := range lines { + if strings.Contains(line, "gpu") || strings.Contains(line, "GPU") { + fields := strings.Fields(line) + if len(fields) >= 2 { + if util, err := strconv.ParseFloat(fields[1], 64); err == nil { + return util + } + } + } + } + + return 0 +} + +// GPU memory info structure +type GpuMemoryInfo struct { + Used float64 + Total float64 +} + +// Get GPU memory information using multiple methods +func getMacGpuMemory() GpuMemoryInfo { + var memInfo GpuMemoryInfo + + // Try to get total VRAM using system_profiler + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "system_profiler", "SPDisplaysDataType") + output, err := cmd.Output() + if err == nil { + memInfo.Total = parseVRAMFromSystemProfiler(string(output)) + } + + // Try to get memory pressure info using vm_stat + vmStat := getMemoryPressureFromVMStat() + if vmStat > 0 { + memInfo.Used = vmStat + } + + // If we couldn't get total VRAM, estimate based on system memory + if memInfo.Total == 0 { + memInfo.Total = estimateGPUMemory() + } + + return memInfo +} + +// Parse VRAM information from system_profiler output +func parseVRAMFromSystemProfiler(output string) float64 { + lines := strings.Split(output, "\n") + + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, "VRAM") || strings.Contains(line, "Memory") { + // Look for memory size patterns like "8 GB", "4 GB", etc. + re := regexp.MustCompile(`(\d+(?:\.\d+)?)\s*(GB|MB)`) + matches := re.FindStringSubmatch(line) + if len(matches) >= 3 { + if size, err := strconv.ParseFloat(matches[1], 64); err == nil { + unit := matches[2] + if unit == "MB" { + return size / 1024 // Convert MB to GB + } + return size + } + } + } + } + + return 0 +} + +// Get memory pressure from vm_stat command +func getMemoryPressureFromVMStat() float64 { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "vm_stat") + output, err := cmd.Output() + if err != nil { + return 0 + } + + lines := strings.Split(string(output), "\n") + var pageSize int64 = 4096 // Default page size + + // Find page size + for _, line := range lines { + if strings.Contains(line, "Mach Virtual Memory Statistics") { + re := regexp.MustCompile(`page size of (\d+)`) + matches := re.FindStringSubmatch(line) + if len(matches) >= 2 { + if size, err := strconv.ParseInt(matches[1], 10, 64); err == nil { + pageSize = size + } + } + break + } + } + + // Parse memory statistics + var activePages, inactivePages, wiredPages int64 + + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, "Pages active:") { + re := regexp.MustCompile(`Pages active:\s+(\d+)`) + matches := re.FindStringSubmatch(line) + if len(matches) >= 2 { + if pages, err := strconv.ParseInt(matches[1], 10, 64); err == nil { + activePages = pages + } + } + } else if strings.Contains(line, "Pages inactive:") { + re := regexp.MustCompile(`Pages inactive:\s+(\d+)`) + matches := re.FindStringSubmatch(line) + if len(matches) >= 2 { + if pages, err := strconv.ParseInt(matches[1], 10, 64); err == nil { + inactivePages = pages + } + } + } else if strings.Contains(line, "Pages wired down:") { + re := regexp.MustCompile(`Pages wired down:\s+(\d+)`) + matches := re.FindStringSubmatch(line) + if len(matches) >= 2 { + if pages, err := strconv.ParseInt(matches[1], 10, 64); err == nil { + wiredPages = pages + } + } + } + } + + // Calculate used memory in GB + usedBytes := (activePages + inactivePages + wiredPages) * pageSize + return float64(usedBytes) / (1024 * 1024 * 1024) // Convert to GB +} + +// Estimate GPU memory based on system memory +func estimateGPUMemory() float64 { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "sysctl", "hw.memsize") + output, err := cmd.Output() + if err != nil { + return 4.0 // Default estimate + } + + // Parse total system memory + re := regexp.MustCompile(`hw\.memsize:\s+(\d+)`) + matches := re.FindStringSubmatch(string(output)) + if len(matches) >= 2 { + if memSize, err := strconv.ParseInt(matches[1], 10, 64); err == nil { + totalGB := float64(memSize) / (1024 * 1024 * 1024) + // Estimate GPU memory as a fraction of system memory + // This is a rough estimate and varies by GPU + return totalGB * 0.1 // Assume 10% of system memory for GPU + } + } + + return 4.0 // Default estimate +} + +// Get GPU data for Windows using PowerShell commands +func getWindowsGpuData() ([]GpuData, error) { + var gpus []GpuData + + // Try to get GPU info using PowerShell + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + // PowerShell command to get GPU information + psCommand := ` + $gpus = Get-WmiObject -Class Win32_VideoController | Where-Object { $_.Name -notlike "*Basic*" -and $_.Name -notlike "*Standard*" } + $gpuInfo = @() + foreach ($gpu in $gpus) { + $gpuInfo += [PSCustomObject]@{ + Name = $gpu.Name + AdapterRAM = $gpu.AdapterRAM + VideoProcessor = $gpu.VideoProcessor + DriverVersion = $gpu.DriverVersion + } + } + $gpuInfo | ConvertTo-Json -Compress + ` + + cmd := exec.CommandContext(ctx, "powershell", "-Command", psCommand) + output, err := cmd.Output() + if err != nil { + return nil, err + } + + // Parse the JSON output + gpuList := parseWindowsGpuOutput(string(output)) + + // Get GPU utilization using a separate PowerShell command + gpuUtil := getWindowsGpuUtilization() + + // Get GPU memory usage + memUsage := getWindowsGpuMemoryUsage() + + // Create GPU data entries + for i, gpu := range gpuList { + // Try to find memory usage for this GPU + memUsed := 0.0 + for adapterName, usage := range memUsage { + if strings.Contains(strings.ToLower(adapterName), strings.ToLower(gpu.Name)) { + memUsed = usage + break + } + } + + gpuData := GpuData{ + Index: i, + Util: gpuUtil, + MemUsed: memUsed, + MemTotal: gpu.MemTotal, + Temp: 0, // Temperature requires additional tools on Windows + } + gpus = append(gpus, gpuData) + log.Printf("Found Windows GPU: %s (%.2f GB total, %.2f GB used)", gpu.Name, gpu.MemTotal, memUsed) + } + + // If no GPUs found, create a default entry + if len(gpus) == 0 { + gpus = append(gpus, GpuData{ + Index: 0, + Util: gpuUtil, + MemUsed: 0, + MemTotal: 0, + Temp: 0, + }) + } + + return gpus, nil +} + +// Windows GPU info structure +type WindowsGpuInfo struct { + Name string `json:"Name"` + MemTotal float64 `json:"MemTotal"` + MemUsed float64 `json:"MemUsed"` +} + +// Parse Windows GPU output from PowerShell +func parseWindowsGpuOutput(output string) []WindowsGpuInfo { + var gpuList []WindowsGpuInfo + + // Try to parse as JSON array + if strings.TrimSpace(output) == "" { + return gpuList + } + + // Simple JSON parsing for the PowerShell output + // The output should be a JSON array of GPU objects + lines := strings.Split(output, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + // Extract GPU name and memory info using regex + nameRe := regexp.MustCompile(`"Name":\s*"([^"]+)"`) + adapterRamRe := regexp.MustCompile(`"AdapterRAM":\s*(\d+)`) + + nameMatches := nameRe.FindStringSubmatch(line) + ramMatches := adapterRamRe.FindStringSubmatch(line) + + if len(nameMatches) >= 2 && len(ramMatches) >= 2 { + name := nameMatches[1] + if ramSize, err := strconv.ParseInt(ramMatches[1], 10, 64); err == nil { + // Convert bytes to GB + memTotal := float64(ramSize) / (1024 * 1024 * 1024) + + gpuInfo := WindowsGpuInfo{ + Name: name, + MemTotal: memTotal, + MemUsed: 0, // Will be estimated based on utilization + } + gpuList = append(gpuList, gpuInfo) + } + } + } + + return gpuList +} + +// Get GPU utilization on Windows using PowerShell +func getWindowsGpuUtilization() float64 { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // PowerShell command to get GPU utilization + psCommand := ` + try { + $gpu = Get-Counter "\GPU Engine(*)\Utilization Percentage" -ErrorAction SilentlyContinue + if ($gpu) { + $maxUtil = ($gpu.CounterSamples | Measure-Object -Property CookedValue -Maximum).Maximum + [math]::Round($maxUtil, 2) + } else { + 0 + } + } catch { + 0 + } + ` + + cmd := exec.CommandContext(ctx, "powershell", "-Command", psCommand) + output, err := cmd.Output() + if err != nil { + return 0 + } + + // Parse the utilization value + utilStr := strings.TrimSpace(string(output)) + if util, err := strconv.ParseFloat(utilStr, 64); err == nil { + return util + } + + return 0 +} + +// Get GPU memory usage on Windows using PowerShell +func getWindowsGpuMemoryUsage() map[string]float64 { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // PowerShell command to get GPU memory usage + psCommand := ` + try { + $gpu = Get-Counter "\GPU Adapter Memory(*)\Dedicated Usage" -ErrorAction SilentlyContinue + $memUsage = @{} + foreach ($counter in $gpu.CounterSamples) { + $adapterName = $counter.InstanceName + $usage = $counter.CookedValue / 1GB + $memUsage[$adapterName] = [math]::Round($usage, 2) + } + $memUsage | ConvertTo-Json -Compress + } catch { + "{}" + } + ` + + cmd := exec.CommandContext(ctx, "powershell", "-Command", psCommand) + output, err := cmd.Output() + if err != nil { + return make(map[string]float64) + } + + // Parse the memory usage JSON + return parseWindowsGpuMemoryOutput(string(output)) +} + +// Parse Windows GPU memory output +func parseWindowsGpuMemoryOutput(output string) map[string]float64 { + memUsage := make(map[string]float64) + + // Simple JSON parsing for memory usage + lines := strings.Split(output, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + // Extract adapter name and memory usage + re := regexp.MustCompile(`"([^"]+)":\s*([\d.]+)`) + matches := re.FindStringSubmatch(line) + if len(matches) >= 3 { + adapterName := matches[1] + if usage, err := strconv.ParseFloat(matches[2], 64); err == nil { + memUsage[adapterName] = usage + } + } + } + + return memUsage +} + +func getGpuData(values map[string]float64) { + platform := detectPlatform() + var gpus []GpuData + var err error + + switch platform { + case "linux": + if isNvidiaSmiAvailable() { + gpus, err = getNvidiaGpuData() + } else if isRocmSmiAvailable() { + gpus, err = getRocmGpuData() + } + case "darwin": + gpus, err = getMacGpuData() + case "windows": + gpus, err = getWindowsGpuData() + } + + if err != nil || len(gpus) == 0 { + return + } + + // Add GPU data to values map + for _, gpu := range gpus { + indexStr := strconv.Itoa(gpu.Index) + values[wshrpc.TimeSeries_Gpu+":"+indexStr+":util"] = gpu.Util + values[wshrpc.TimeSeries_Gpu+":"+indexStr+":mem_used"] = gpu.MemUsed + values[wshrpc.TimeSeries_Gpu+":"+indexStr+":mem_total"] = gpu.MemTotal + values[wshrpc.TimeSeries_Gpu+":"+indexStr+":temp"] = gpu.Temp + } + + // Add aggregate GPU utilization (average of all GPUs) + if len(gpus) > 0 { + totalUtil := 0.0 + for _, gpu := range gpus { + totalUtil += gpu.Util + } + values[wshrpc.TimeSeries_Gpu] = totalUtil / float64(len(gpus)) + } +} + func getCpuData(values map[string]float64) { percentArr, err := cpu.Percent(0, false) if err != nil { @@ -51,6 +694,7 @@ func generateSingleServerData(client *wshutil.WshRpc, connName string) { values := make(map[string]float64) getCpuData(values) getMemData(values) + getGpuData(values) // Add this line to get GPU data tsData := wshrpc.TimeSeriesData{Ts: now.UnixMilli(), Values: values} event := wps.WaveEvent{ Event: wps.Event_SysInfo, diff --git a/pkg/wshrpc/wshremote/sysinfo_test.go b/pkg/wshrpc/wshremote/sysinfo_test.go new file mode 100644 index 0000000000..8ea269332a --- /dev/null +++ b/pkg/wshrpc/wshremote/sysinfo_test.go @@ -0,0 +1,93 @@ +// Copyright 2025, Command Line Inc. +// SPDX-License-Identifier: Apache-2.0 + +package wshremote + +import ( + "fmt" + "strings" + "testing" +) + +func TestDetectPlatform(t *testing.T) { + platform := detectPlatform() + if platform == "" { + t.Error("Platform detection returned empty string") + } + t.Logf("Detected platform: %s", platform) +} + +func TestNvidiaSmiAvailability(t *testing.T) { + available := isNvidiaSmiAvailable() + t.Logf("nvidia-smi available: %v", available) +} + +func TestRocmSmiAvailability(t *testing.T) { + available := isRocmSmiAvailable() + t.Logf("rocm-smi available: %v", available) +} + +func TestGetGpuData(t *testing.T) { + values := make(map[string]float64) + getGpuData(values) + + // Check if any GPU data was collected + hasGpuData := false + for key := range values { + if key == "gpu" || (len(key) > 4 && key[:4] == "gpu:") { + hasGpuData = true + t.Logf("Found GPU data: %s = %f", key, values[key]) + } + } + + if !hasGpuData { + t.Log("No GPU data collected (this is normal if no GPU tools are available)") + } +} + +func TestMacOSGpuFunctions(t *testing.T) { + // Test system_profiler parsing + output := `Graphics/Displays: + Intel Iris Pro: + Chipset Model: Intel Iris Pro + VRAM (Dynamic, Max): 1536 MB + Resolution: 2560 x 1600 + NVIDIA GeForce GT 750M: + Chipset Model: NVIDIA GeForce GT 750M + VRAM (Total): 2048 MB` + + gpuNames := parseSystemProfilerOutput(output) + for i, name := range gpuNames { + gpuNames[i] = fmt.Sprintf("%q", name) + } + + t.Logf("Parsed GPU names: %s", strings.Join(gpuNames, ", ")) + + // Test VRAM parsing + vram := parseVRAMFromSystemProfiler(output) + t.Logf("Parsed VRAM: %f GB", vram) + + // Test memory pressure + memPressure := getMemoryPressureFromVMStat() + t.Logf("Memory pressure: %f GB", memPressure) + + // Test GPU memory estimation + estimatedMem := estimateGPUMemory() + t.Logf("Estimated GPU memory: %f GB", estimatedMem) +} + +func TestWindowsGpuFunctions(t *testing.T) { + // Test Windows GPU output parsing + output := `[{"Name":"NVIDIA GeForce RTX 3080","AdapterRAM":10737418240,"VideoProcessor":"NVIDIA GeForce RTX 3080","DriverVersion":"31.0.15.3179"}]` + + gpuList := parseWindowsGpuOutput(output) + t.Logf("Parsed Windows GPUs: %d", len(gpuList)) + for i, gpu := range gpuList { + t.Logf("GPU %d: %s (%.2f GB)", i, gpu.Name, gpu.MemTotal) + } + + // Test Windows GPU memory output parsing + memOutput := `{"NVIDIA GeForce RTX 3080":2.45}` + memUsage := parseWindowsGpuMemoryOutput(memOutput) + t.Logf("Memory usage: %v", memUsage) +} diff --git a/pkg/wshrpc/wshrpctypes.go b/pkg/wshrpc/wshrpctypes.go index 3d629533cf..e6f1e1ed9b 100644 --- a/pkg/wshrpc/wshrpctypes.go +++ b/pkg/wshrpc/wshrpctypes.go @@ -580,6 +580,7 @@ type RemoteInfo struct { const ( TimeSeries_Cpu = "cpu" + TimeSeries_Gpu = "gpu" ) type TimeSeriesData struct { From be0ea10ff5ba771215b3b0fc944494a543cec398 Mon Sep 17 00:00:00 2001 From: Dawid Ciepiela <71898979+sarumaj@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:33:50 +0200 Subject: [PATCH 2/5] wshremote: fix unused output --- pkg/wshrpc/wshremote/wshremote.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/wshrpc/wshremote/wshremote.go b/pkg/wshrpc/wshremote/wshremote.go index ec90e367c7..7fc97793a0 100644 --- a/pkg/wshrpc/wshremote/wshremote.go +++ b/pkg/wshrpc/wshremote/wshremote.go @@ -810,9 +810,9 @@ func (*ServerImpl) RemoteWriteFileCommand(ctx context.Context, data wshrpc.FileD } defer utilfn.GracefulClose(file, "RemoteWriteFileCommand", path) if atOffset > 0 && !append { - n, err = file.WriteAt(dataBytes[:n], atOffset) + _, err = file.WriteAt(dataBytes[:n], atOffset) } else { - n, err = file.Write(dataBytes[:n]) + _, err = file.Write(dataBytes[:n]) } if err != nil { return fmt.Errorf("cannot write to file %q: %w", path, err) From 59257cd2313db03144c64061434bb1c4a2446957 Mon Sep 17 00:00:00 2001 From: Dawid Ciepiela <71898979+sarumaj@users.noreply.github.com> Date: Tue, 5 Aug 2025 22:03:12 +0200 Subject: [PATCH 3/5] refactor and apply suggestions --- docs/GPU_MONITORING.md | 1 - pkg/wshrpc/wshremote/sysinfo.go | 67 +++++++++++++++++---------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/docs/GPU_MONITORING.md b/docs/GPU_MONITORING.md index cbce77d1c5..558458393b 100644 --- a/docs/GPU_MONITORING.md +++ b/docs/GPU_MONITORING.md @@ -138,7 +138,6 @@ The tests will check: ## Future Enhancements -- Windows GPU monitoring support - More detailed macOS GPU monitoring - GPU power consumption metrics - GPU fan speed monitoring diff --git a/pkg/wshrpc/wshremote/sysinfo.go b/pkg/wshrpc/wshremote/sysinfo.go index 7a70eb5967..e0602f6eee 100644 --- a/pkg/wshrpc/wshremote/sysinfo.go +++ b/pkg/wshrpc/wshremote/sysinfo.go @@ -8,6 +8,7 @@ import ( "log" "os/exec" "regexp" + "runtime" "strconv" "strings" "time" @@ -22,6 +23,20 @@ import ( const BYTES_PER_GB = 1073741824 +const PS_GPU_COMMAND = ` + $gpus = Get-WmiObject -Class Win32_VideoController | Where-Object { $_.Name -notlike "*Basic*" -and $_.Name -notlike "*Standard*" } + $gpuInfo = @() + foreach ($gpu in $gpus) { + $gpuInfo += [PSCustomObject]@{ + Name = $gpu.Name + AdapterRAM = $gpu.AdapterRAM + VideoProcessor = $gpu.VideoProcessor + DriverVersion = $gpu.DriverVersion + } + } + $gpuInfo | ConvertTo-Json -Compress +` + // GPU data structure to hold parsed GPU information type GpuData struct { Index int `json:"index"` @@ -33,22 +48,12 @@ type GpuData struct { // Platform detection func detectPlatform() string { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - cmd := exec.CommandContext(ctx, "uname", "-s") - output, err := cmd.Output() - if err != nil { - return "unknown" - } - - os := strings.ToLower(strings.TrimSpace(string(output))) - switch { - case strings.Contains(os, "linux"): + switch runtime.GOOS { + case "linux": return "linux" - case strings.Contains(os, "darwin"): + case "darwin": return "darwin" - case strings.Contains(os, "windows"): + case "windows": return "windows" default: return "unknown" @@ -90,8 +95,16 @@ func getNvidiaGpuData() ([]GpuData, error) { for _, line := range lines { fields := strings.Split(line, ", ") if len(fields) >= 5 { - index, _ := strconv.Atoi(strings.TrimSpace(fields[0])) - util, _ := strconv.ParseFloat(strings.TrimSpace(fields[1]), 64) + index, err := strconv.Atoi(strings.TrimSpace(fields[0])) + if err != nil { + log.Printf("Error parsing nvidia-smi output: %v", err) + continue + } + util, err := strconv.ParseFloat(strings.TrimSpace(fields[1]), 64) + if err != nil { + log.Printf("Error parsing nvidia-smi output: %v", err) + continue + } memUsed, _ := strconv.ParseFloat(strings.TrimSpace(fields[2]), 64) memTotal, _ := strconv.ParseFloat(strings.TrimSpace(fields[3]), 64) temp, _ := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64) @@ -396,6 +409,7 @@ func estimateGPUMemory() float64 { totalGB := float64(memSize) / (1024 * 1024 * 1024) // Estimate GPU memory as a fraction of system memory // This is a rough estimate and varies by GPU + // Actual GPU memory varies significantly and this should be treated as unreliable return totalGB * 0.1 // Assume 10% of system memory for GPU } } @@ -412,21 +426,8 @@ func getWindowsGpuData() ([]GpuData, error) { defer cancel() // PowerShell command to get GPU information - psCommand := ` - $gpus = Get-WmiObject -Class Win32_VideoController | Where-Object { $_.Name -notlike "*Basic*" -and $_.Name -notlike "*Standard*" } - $gpuInfo = @() - foreach ($gpu in $gpus) { - $gpuInfo += [PSCustomObject]@{ - Name = $gpu.Name - AdapterRAM = $gpu.AdapterRAM - VideoProcessor = $gpu.VideoProcessor - DriverVersion = $gpu.DriverVersion - } - } - $gpuInfo | ConvertTo-Json -Compress - ` - cmd := exec.CommandContext(ctx, "powershell", "-Command", psCommand) + cmd := exec.CommandContext(ctx, "powershell", "-Command", PS_GPU_COMMAND) output, err := cmd.Output() if err != nil { return nil, err @@ -460,7 +461,6 @@ func getWindowsGpuData() ([]GpuData, error) { Temp: 0, // Temperature requires additional tools on Windows } gpus = append(gpus, gpuData) - log.Printf("Found Windows GPU: %s (%.2f GB total, %.2f GB used)", gpu.Name, gpu.MemTotal, memUsed) } // If no GPUs found, create a default entry @@ -639,6 +639,9 @@ func getGpuData(values map[string]float64) { } if err != nil || len(gpus) == 0 { + if err != nil { + log.Printf("Error getting GPU data: %v", err) + } return } @@ -694,7 +697,7 @@ func generateSingleServerData(client *wshutil.WshRpc, connName string) { values := make(map[string]float64) getCpuData(values) getMemData(values) - getGpuData(values) // Add this line to get GPU data + getGpuData(values) tsData := wshrpc.TimeSeriesData{Ts: now.UnixMilli(), Values: values} event := wps.WaveEvent{ Event: wps.Event_SysInfo, From 1aebc5b0d26b6ea0dd1d0e19c43ab45398a3f5b2 Mon Sep 17 00:00:00 2001 From: Dawid Ciepiela <71898979+sarumaj@users.noreply.github.com> Date: Tue, 5 Aug 2025 22:22:30 +0200 Subject: [PATCH 4/5] get rid of estimations --- docs/GPU_MONITORING.md | 2 +- pkg/wshrpc/wshremote/sysinfo.go | 55 +++++++++------------------- pkg/wshrpc/wshremote/sysinfo_test.go | 4 -- 3 files changed, 19 insertions(+), 42 deletions(-) diff --git a/docs/GPU_MONITORING.md b/docs/GPU_MONITORING.md index 558458393b..aae7d03224 100644 --- a/docs/GPU_MONITORING.md +++ b/docs/GPU_MONITORING.md @@ -74,7 +74,7 @@ The frontend supports the following GPU-related plot types: ## Implementation Details ### Platform Detection -The system automatically detects the platform using `uname -s` and selects the appropriate GPU monitoring method. +The system automatically detects the platform using Go's `runtime.GOOS` and selects the appropriate GPU monitoring method. ### Tool Availability Detection Before attempting to collect GPU data, the system checks if the required tools (`nvidia-smi` or `rocm-smi`) are available on the system. diff --git a/pkg/wshrpc/wshremote/sysinfo.go b/pkg/wshrpc/wshremote/sysinfo.go index e0602f6eee..ba6a41cd45 100644 --- a/pkg/wshrpc/wshremote/sysinfo.go +++ b/pkg/wshrpc/wshremote/sysinfo.go @@ -143,10 +143,23 @@ func getRocmGpuData() ([]GpuData, error) { for _, match := range matches { if len(match) >= 5 { - index, _ := strconv.Atoi(match[1]) - memTotal, _ := strconv.ParseFloat(match[2], 64) - memUsed, _ := strconv.ParseFloat(match[3], 64) - temp, _ := strconv.ParseFloat(match[4], 64) + index, err := strconv.Atoi(match[1]) + if err != nil { + log.Printf("Error parsing rocm-smi output: %v", err) + continue + } + memTotal, err := strconv.ParseFloat(match[2], 64) + if err != nil { + log.Printf("Error parsing rocm-smi output: %v", err) + } + memUsed, err := strconv.ParseFloat(match[3], 64) + if err != nil { + log.Printf("Error parsing rocm-smi output: %v", err) + } + temp, err := strconv.ParseFloat(match[4], 64) + if err != nil { + log.Printf("Error parsing rocm-smi output: %v", err) + } // Convert MB to GB memTotal = memTotal / 1024 @@ -292,11 +305,6 @@ func getMacGpuMemory() GpuMemoryInfo { memInfo.Used = vmStat } - // If we couldn't get total VRAM, estimate based on system memory - if memInfo.Total == 0 { - memInfo.Total = estimateGPUMemory() - } - return memInfo } @@ -390,33 +398,6 @@ func getMemoryPressureFromVMStat() float64 { return float64(usedBytes) / (1024 * 1024 * 1024) // Convert to GB } -// Estimate GPU memory based on system memory -func estimateGPUMemory() float64 { - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - - cmd := exec.CommandContext(ctx, "sysctl", "hw.memsize") - output, err := cmd.Output() - if err != nil { - return 4.0 // Default estimate - } - - // Parse total system memory - re := regexp.MustCompile(`hw\.memsize:\s+(\d+)`) - matches := re.FindStringSubmatch(string(output)) - if len(matches) >= 2 { - if memSize, err := strconv.ParseInt(matches[1], 10, 64); err == nil { - totalGB := float64(memSize) / (1024 * 1024 * 1024) - // Estimate GPU memory as a fraction of system memory - // This is a rough estimate and varies by GPU - // Actual GPU memory varies significantly and this should be treated as unreliable - return totalGB * 0.1 // Assume 10% of system memory for GPU - } - } - - return 4.0 // Default estimate -} - // Get GPU data for Windows using PowerShell commands func getWindowsGpuData() ([]GpuData, error) { var gpus []GpuData @@ -518,7 +499,7 @@ func parseWindowsGpuOutput(output string) []WindowsGpuInfo { gpuInfo := WindowsGpuInfo{ Name: name, MemTotal: memTotal, - MemUsed: 0, // Will be estimated based on utilization + MemUsed: 0, } gpuList = append(gpuList, gpuInfo) } diff --git a/pkg/wshrpc/wshremote/sysinfo_test.go b/pkg/wshrpc/wshremote/sysinfo_test.go index 8ea269332a..d583377e0f 100644 --- a/pkg/wshrpc/wshremote/sysinfo_test.go +++ b/pkg/wshrpc/wshremote/sysinfo_test.go @@ -70,10 +70,6 @@ func TestMacOSGpuFunctions(t *testing.T) { // Test memory pressure memPressure := getMemoryPressureFromVMStat() t.Logf("Memory pressure: %f GB", memPressure) - - // Test GPU memory estimation - estimatedMem := estimateGPUMemory() - t.Logf("Estimated GPU memory: %f GB", estimatedMem) } func TestWindowsGpuFunctions(t *testing.T) { From 14cfc5a9944f41083b7ba18c74cd407e67412873 Mon Sep 17 00:00:00 2001 From: Dawid Ciepiela <71898979+sarumaj@users.noreply.github.com> Date: Tue, 5 Aug 2025 22:25:49 +0200 Subject: [PATCH 5/5] review tests --- pkg/wshrpc/wshremote/sysinfo_test.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pkg/wshrpc/wshremote/sysinfo_test.go b/pkg/wshrpc/wshremote/sysinfo_test.go index d583377e0f..40b515123b 100644 --- a/pkg/wshrpc/wshremote/sysinfo_test.go +++ b/pkg/wshrpc/wshremote/sysinfo_test.go @@ -66,10 +66,16 @@ func TestMacOSGpuFunctions(t *testing.T) { // Test VRAM parsing vram := parseVRAMFromSystemProfiler(output) t.Logf("Parsed VRAM: %f GB", vram) + if vram != 1.5 { + t.Errorf("Expected VRAM to be 1536, got %f", vram) + } // Test memory pressure memPressure := getMemoryPressureFromVMStat() t.Logf("Memory pressure: %f GB", memPressure) + if memPressure != 0 { + t.Errorf("Expected memory pressure to be 0, got %f", memPressure) + } } func TestWindowsGpuFunctions(t *testing.T) { @@ -81,9 +87,24 @@ func TestWindowsGpuFunctions(t *testing.T) { for i, gpu := range gpuList { t.Logf("GPU %d: %s (%.2f GB)", i, gpu.Name, gpu.MemTotal) } + if len(gpuList) != 1 { + t.Errorf("Expected 1 GPU, got %d", len(gpuList)) + } + if gpuList[0].Name != "NVIDIA GeForce RTX 3080" { + t.Errorf("Expected GPU name to be NVIDIA GeForce RTX 3080, got %s", gpuList[0].Name) + } + if gpuList[0].MemTotal != 10 { + t.Errorf("Expected GPU memory total to be 10, got %f", gpuList[0].MemTotal) + } + if gpuList[0].MemUsed != 0 { + t.Errorf("Expected GPU memory used to be 0, got %f", gpuList[0].MemUsed) + } // Test Windows GPU memory output parsing memOutput := `{"NVIDIA GeForce RTX 3080":2.45}` memUsage := parseWindowsGpuMemoryOutput(memOutput) t.Logf("Memory usage: %v", memUsage) + if memUsage["NVIDIA GeForce RTX 3080"] != 2.45 { + t.Errorf("Expected GPU memory usage to be 2.45, got %f", memUsage["NVIDIA GeForce RTX 3080"]) + } }