diff --git a/src/btop_config.cpp b/src/btop_config.cpp index c9211771c..98fea6f61 100644 --- a/src/btop_config.cpp +++ b/src/btop_config.cpp @@ -104,7 +104,7 @@ namespace Config { {"update_ms", "#* Update time in milliseconds, recommended 2000 ms or above for better sample times for graphs."}, - {"proc_sorting", "#* Processes sorting, \"pid\" \"program\" \"arguments\" \"threads\" \"user\" \"memory\" \"cpu lazy\" \"cpu direct\",\n" + {"proc_sorting", "#* Processes sorting, \"pid\" \"program\" \"arguments\" \"threads\" \"user\" \"memory\" \"gpu\" \"gpu memory\" \"cpu lazy\" \"cpu direct\",\n" "#* \"cpu lazy\" sorts top process over time (easier to follow), \"cpu direct\" updates top process directly."}, {"proc_reversed", "#* Reverse sorting order, True or False."}, @@ -121,6 +121,10 @@ namespace Config { {"proc_cpu_graphs", "#* Show cpu graph for each process."}, + {"proc_gpu_graphs", "#* Show gpu graph for each process."}, + + {"proc_gpu_only", "#* Show only processes with active GPU usage or GPU memory allocation in the process list."}, + {"proc_info_smaps", "#* Use /proc/[pid]/smaps for memory information in the process info box (very slow but more accurate)"}, {"proc_left", "#* Show proc box on left side of screen instead of right."}, @@ -304,6 +308,8 @@ namespace Config { {"proc_per_core", false}, {"proc_mem_bytes", true}, {"proc_cpu_graphs", true}, + {"proc_gpu_graphs", true}, + {"proc_gpu_only", false}, {"proc_info_smaps", false}, {"proc_left", false}, {"proc_filter_kernel", false}, diff --git a/src/btop_draw.cpp b/src/btop_draw.cpp index 5f19e4b7d..93d72c137 100644 --- a/src/btop_draw.cpp +++ b/src/btop_draw.cpp @@ -1577,8 +1577,10 @@ namespace Proc { int scroll_pos; string selected_name; std::unordered_map p_graphs; + std::unordered_map p_graphs_gpu; std::unordered_map p_wide_cmd; std::unordered_map p_counters; + std::unordered_map p_counters_gpu; int counter = 0; Draw::TextEdit filter; Draw::Graph detailed_cpu_graph; @@ -1681,7 +1683,15 @@ namespace Proc { 
auto& graph_bg = Symbols::graph_symbols.at((graph_symbol == "default" ? Config::getS("graph_symbol") + "_up" : graph_symbol + "_up")).at(6); auto mem_bytes = Config::getB("proc_mem_bytes"); auto vim_keys = Config::getB("vim_keys"); - auto show_graphs = Config::getB("proc_cpu_graphs"); + auto show_cpu_graphs = Config::getB("proc_cpu_graphs"); + auto show_gpu_graphs = Config::getB("proc_gpu_graphs"); + #if defined(__linux__) + const bool show_gpu = width >= ((show_cpu_graphs or show_gpu_graphs) ? 70 : 65); + const bool show_gpu_mem = width >= ((show_cpu_graphs or show_gpu_graphs) ? 78 : 73); + #else + const bool show_gpu = false; + const bool show_gpu_mem = false; + #endif const auto pause_proc_list = Config::getB("pause_proc_list"); auto follow_process = Config::getB("follow_process"); int followed_pid = Config::getI("followed_pid"); @@ -1772,10 +1782,14 @@ namespace Proc { //? Adapt sizes of text fields user_size = (width < 75 ? 5 : 10); thread_size = (width < 75 ? - 1 : 4); - prog_size = (width > 70 ? 16 : ( width > 55 ? 8 : width - user_size - thread_size - 33)); - cmd_size = (width > 55 ? width - prog_size - user_size - thread_size - 33 : -1); - tree_size = width - user_size - thread_size - 23; - if (not show_graphs) { + const int gpu_cols = (show_gpu_mem ? 6 : 0) + (show_gpu ? 6 : 0); + const int gpu_graph_cols = (show_gpu and show_gpu_graphs ? 5 : 0); + const int proc_fixed = 33 + gpu_cols + gpu_graph_cols; + const int tree_fixed = 23 + gpu_cols + gpu_graph_cols; + prog_size = (width > 70 ? 16 : ( width > 55 ? 8 : width - user_size - thread_size - proc_fixed)); + cmd_size = (width > 55 ? width - prog_size - user_size - thread_size - proc_fixed : -1); + tree_size = width - user_size - thread_size - tree_fixed; + if (not show_cpu_graphs) { cmd_size += 5; tree_size += 5; } @@ -1880,15 +1894,24 @@ namespace Proc { //? 
pause, per-core, reverse, tree and sorting const auto& sorting = Config::getS("proc_sorting"); + const auto gpu_only = Config::getB("proc_gpu_only"); const int sort_len = sorting.size(); const int sort_pos = x + width - sort_len - 8; + if (width > 70 + sort_len) { + fmt::format_to(std::back_inserter(out), "{}{}{}{}{}{}{}{}{}{}{}", + Mv::to(y, sort_pos - 43), title_left, gpu_only ? Fx::b : "", + Theme::c("hi_fg"), 'g', Theme::c("title"), "pu-", + Theme::c("title"), "only", + Fx::ub, title_right); + Input::mouse_mappings["g"] = {y, sort_pos - 42, 1, 8}; + } if (width > 60 + sort_len) { - fmt::format_to(std::back_inserter(out), "{}{}{}{}{}{}{}{}{}{}{}", + fmt::format_to(std::back_inserter(out), "{}{}{}{}{}{}{}{}{}{}{}", Mv::to(y, sort_pos - 32), title_left, pause_proc_list ? Fx::b : "", Theme::c("title"), "pa", Theme::c("hi_fg"), 'u', Theme::c("title"), "se", - Fx::ub, title_right); - Input::mouse_mappings["u"] = {y, sort_pos - 31, 1, 5}; + Fx::ub, title_right); + Input::mouse_mappings["u"] = {y, sort_pos - 31, 1, 5}; } if (width > 55 + sort_len) { out += Mv::to(y, sort_pos - 25) + title_left + (Config::getB("proc_per_core") ? Fx::b : "") + Theme::c("title") @@ -1958,7 +1981,10 @@ namespace Proc { out += (thread_size > 0 ? Mv::l(4) + "Threads: " : "") + ljust("User:", user_size) + ' ' + rjust((mem_bytes ? "MemB" : "Mem%"), 5) + ' ' - + rjust("Cpu%", (show_graphs ? 10 : 5)) + Fx::ub; + + rjust("Cpu%", (show_cpu_graphs ? 10 : 5)) + + (show_gpu_mem ? string{" "} + rjust("GMem", 5) : "") + + (show_gpu ? string{" "} + rjust("Gpu%", (show_gpu_graphs ? 10 : 5)) : "") + + Fx::ub; } //* End of redraw block @@ -2021,9 +2047,9 @@ namespace Proc { selected_depth = p.depth; } - //? Update graphs for processes with above 0.0% cpu usage, delete if below 0.1% 10x times - bool has_graph = show_graphs ? p_counters.contains(p.pid) : false; - if (show_graphs and ((p.cpu_p > 0 and not has_graph) or (not data_same and has_graph))) { + //? 
Update cpu graphs for processes with above 0.0% cpu usage, delete if below 0.1% 10x times + bool has_graph = show_cpu_graphs ? p_counters.contains(p.pid) : false; + if (show_cpu_graphs and ((p.cpu_p > 0 and not has_graph) or (not data_same and has_graph))) { if (not has_graph) { p_graphs[p.pid] = Draw::Graph{5, 1, "", {}, graph_symbol}; p_counters[p.pid] = 0; @@ -2036,6 +2062,21 @@ namespace Proc { p_counters[p.pid] = 0; } + //? Update gpu graphs for processes with above 0.0% gpu usage, delete if below 0.1% 10x times + bool has_gpu_graph = (show_gpu and show_gpu_graphs) ? p_counters_gpu.contains(p.pid) : false; + if ((show_gpu and show_gpu_graphs) and ((p.gpu_p > 0 and not has_gpu_graph) or (not data_same and has_gpu_graph))) { + if (not has_gpu_graph) { + p_graphs_gpu[p.pid] = Draw::Graph{5, 1, "", {}, graph_symbol}; + p_counters_gpu[p.pid] = 0; + } + else if (p.gpu_p < 0.1 and ++p_counters_gpu[p.pid] >= 10) { + if (p_graphs_gpu.contains(p.pid)) p_graphs_gpu.erase(p.pid); + p_counters_gpu.erase(p.pid); + } + else + p_counters_gpu[p.pid] = 0; + } + out += Fx::reset; //? Set correct gradient colors if enabled @@ -2110,6 +2151,18 @@ namespace Proc { if (cpu_str.ends_with('.')) cpu_str.pop_back(); cpu_str += "k"; } + string gpu_str; + if (show_gpu) { + if (p.gpu_p <= 0.0 and p.gpu_m == 0) { + gpu_str = "-"; + } + else { + gpu_str = fmt::format("{:.1f}", clamp(p.gpu_p, 0.0, 100.0)); + if (gpu_str.size() > 4) gpu_str.resize(4); + if (gpu_str.ends_with('.')) gpu_str.pop_back(); + } + } + const string gpu_mem_str = show_gpu_mem ? (p.gpu_m == 0 ? "-" : floating_humanizer(p.gpu_m, true)) : ""; string mem_str = (mem_bytes ? floating_humanizer(p.mem, true) : ""); if (not mem_bytes) { double mem_p = clamp((double)p.mem * 100 / totalMem, 0.0, 100.0); @@ -2131,9 +2184,13 @@ namespace Proc { out += (thread_size > 0 ? t_color + rjust(proc_threads_string, thread_size) + ' ' + end : "" ) + g_color + ljust((cmp_greater(p.user.size(), user_size) ? 
p.user.substr(0, user_size - 1) + '+' : p.user), user_size) + ' ' + m_color + rjust(mem_str, 5) + end + ' ' - + (is_selected or is_followed ? "" : Theme::c("inactive_fg")) + (show_graphs ? graph_bg * 5: "") + + (is_selected or is_followed ? "" : Theme::c("inactive_fg")) + (show_cpu_graphs ? graph_bg * 5: "") + (p_graphs.contains(p.pid) ? Mv::l(5) + c_color + p_graphs.at(p.pid)({(p.cpu_p >= 0.1 and p.cpu_p < 5 ? 5ll : (long long)round(p.cpu_p))}, data_same) : "") + end + ' ' - + c_color + rjust(cpu_str, 4) + " " + end; + + c_color + rjust(cpu_str, 4) + ' ' + end + + (show_gpu_mem ? c_color + rjust(gpu_mem_str, 5) + ' ' + end : "") + + (is_selected or is_followed ? "" : Theme::c("inactive_fg")) + ((show_gpu and show_gpu_graphs) ? graph_bg * 5 : "") + + (p_graphs_gpu.contains(p.pid) ? Mv::l(5) + c_color + p_graphs_gpu.at(p.pid)({(p.gpu_p >= 0.1 and p.gpu_p < 5 ? 5ll : (long long)round(p.gpu_p))}, data_same) : "") + end + + (show_gpu ? c_color + rjust(gpu_str, 5) + ' ' + end : ""); if (lc++ > height - 5) break; else if (lc > height - 5 and proc_banner_shown) break; } @@ -2182,6 +2239,14 @@ namespace Proc { return rng::find(plist, pair.first, &proc_info::pid) == plist.end(); }); + std::erase_if(p_graphs_gpu, [&](const auto& pair) { + return rng::find(plist, pair.first, &proc_info::pid) == plist.end(); + }); + + std::erase_if(p_counters_gpu, [&](const auto& pair) { + return rng::find(plist, pair.first, &proc_info::pid) == plist.end(); + }); + std::erase_if(p_wide_cmd, [&](const auto& pair) { return rng::find(plist, pair.first, &proc_info::pid) == plist.end(); }); @@ -2230,7 +2295,9 @@ namespace Draw { Runner::redraw = true; if (not (Proc::resized or Global::resized)) { Proc::p_counters.clear(); + Proc::p_counters_gpu.clear(); Proc::p_graphs.clear(); + Proc::p_graphs_gpu.clear(); } if (Menu::active) Menu::redraw = true; diff --git a/src/btop_input.cpp b/src/btop_input.cpp index 28a966773..245bb0664 100644 --- a/src/btop_input.cpp +++ b/src/btop_input.cpp @@ -377,6 +377,11 
@@ namespace Input { else if (key == "c") Config::flip("proc_per_core"); + else if (key == "g") { + Config::flip("proc_gpu_only"); + Config::set("update_following", true); + } + else if (key == "%") Config::flip("proc_mem_bytes"); diff --git a/src/btop_menu.cpp b/src/btop_menu.cpp index fbaadf32d..897baf084 100644 --- a/src/btop_menu.cpp +++ b/src/btop_menu.cpp @@ -201,6 +201,7 @@ namespace Menu { {"f, /", "To enter a process filter. Start with ! for regex."}, {"F", "Follow selected process."}, {"u", "Pause process list."}, + {"g", "Toggle GPU-only process filter."}, {"delete", "Clear any entered filter."}, {"c", "Toggle per-core cpu usage of processes."}, {"r", "Reverse sorting order in processes box."}, @@ -817,7 +818,8 @@ namespace Menu { "", "Possible values:", "\"pid\", \"program\", \"arguments\", \"threads\",", - "\"user\", \"memory\", \"cpu lazy\" and", + "\"user\", \"memory\", \"gpu\",", + "\"gpu memory\", \"cpu lazy\" and", "\"cpu direct\".", "", "\"cpu lazy\" updates top process over time.", @@ -872,6 +874,16 @@ namespace Menu { "Show cpu graph for each process.", "", "True or False"}, + {"proc_gpu_graphs", + "Show gpu graph for each process.", + "", + "True or False"}, + {"proc_gpu_only", + "Show only GPU-active processes.", + "", + "When enabled, only processes with", + "non-zero GPU usage or GPU memory", + "allocation are shown."}, {"proc_filter_kernel", "(Linux) Filter kernel processes from output.", "", diff --git a/src/btop_shared.cpp b/src/btop_shared.cpp index 0990fe681..3c8526a7d 100644 --- a/src/btop_shared.cpp +++ b/src/btop_shared.cpp @@ -109,8 +109,10 @@ bool set_priority(pid_t pid, int priority) { case 3: rng::stable_sort(proc_vec, rng::less{}, &proc_info::threads); break; case 4: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::user); break; case 5: rng::stable_sort(proc_vec, rng::less{}, &proc_info::mem); break; - case 6: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_p); break; - case 7: rng::stable_sort(proc_vec, 
rng::less{}, &proc_info::cpu_c); break; + case 6: rng::stable_sort(proc_vec, rng::less{}, &proc_info::gpu_p); break; + case 7: rng::stable_sort(proc_vec, rng::less{}, &proc_info::gpu_m); break; + case 8: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_p); break; + case 9: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_c); break; } } else { @@ -121,8 +123,10 @@ bool set_priority(pid_t pid, int priority) { case 3: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::threads); break; case 4: rng::stable_sort(proc_vec, rng::less{}, &proc_info::user); break; case 5: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::mem); break; - case 6: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_p); break; - case 7: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_c); break; + case 6: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::gpu_p); break; + case 7: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::gpu_m); break; + case 8: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_p); break; + case 9: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_c); break; } } @@ -150,16 +154,20 @@ bool set_priority(pid_t pid, int priority) { switch (v_index(sort_vector, sorting)) { case 3: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().threads < b.entry.get().threads; }); break; case 5: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().mem < b.entry.get().mem; }); break; - case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p < b.entry.get().cpu_p; }); break; - case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c < b.entry.get().cpu_c; }); break; + case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_p < b.entry.get().gpu_p; }); break; + case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return 
a.entry.get().gpu_m < b.entry.get().gpu_m; }); break; + case 8: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p < b.entry.get().cpu_p; }); break; + case 9: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c < b.entry.get().cpu_c; }); break; } } else { switch (v_index(sort_vector, sorting)) { case 3: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().threads > b.entry.get().threads; }); break; case 5: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().mem > b.entry.get().mem; }); break; - case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p > b.entry.get().cpu_p; }); break; - case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c > b.entry.get().cpu_c; }); break; + case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_p > b.entry.get().gpu_p; }); break; + case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_m > b.entry.get().gpu_m; }); break; + case 8: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p > b.entry.get().cpu_p; }); break; + case 9: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c > b.entry.get().cpu_c; }); break; } } } @@ -173,6 +181,10 @@ bool set_priority(pid_t pid, int priority) { } auto matches_filter(const proc_info& proc, const std::string& filter) -> bool { + if (Config::getB("proc_gpu_only") and proc.gpu_p <= 0.0 and proc.gpu_m == 0) { + return false; + } + if (filter.starts_with("!")) { if (filter.size() == 1) { return true; @@ -242,6 +254,8 @@ bool set_priority(pid_t pid, int priority) { if (p.state != 'X') { cur_proc.cpu_p += p.cpu_p; cur_proc.cpu_c += p.cpu_c; + cur_proc.gpu_p += p.gpu_p; + cur_proc.gpu_m += p.gpu_m; cur_proc.mem += p.mem; cur_proc.threads 
+= p.threads; } @@ -251,6 +265,8 @@ bool set_priority(pid_t pid, int priority) { else if (Config::getB("proc_aggregate") and p.state != 'X') { cur_proc.cpu_p += p.cpu_p; cur_proc.cpu_c += p.cpu_c; + cur_proc.gpu_p += p.gpu_p; + cur_proc.gpu_m += p.gpu_m; cur_proc.mem += p.mem; cur_proc.threads += p.threads; } diff --git a/src/btop_shared.hpp b/src/btop_shared.hpp index 96f4de32a..cc40182f1 100644 --- a/src/btop_shared.hpp +++ b/src/btop_shared.hpp @@ -373,6 +373,8 @@ namespace Proc { "threads", "user", "memory", + "gpu", + "gpu memory", "cpu direct", "cpu lazy", }; @@ -404,11 +406,14 @@ namespace Proc { uint64_t mem{}; double cpu_p{}; // defaults to = 0.0 double cpu_c{}; // defaults to = 0.0 + double gpu_p{}; // defaults to = 0.0 + uint64_t gpu_m{}; char state = '0'; int64_t p_nice{}; uint64_t ppid{}; uint64_t cpu_s{}; uint64_t cpu_t{}; + uint64_t gpu_t{}; uint64_t death_time{}; string prefix{}; // defaults to "" size_t depth{}; diff --git a/src/linux/btop_collect.cpp b/src/linux/btop_collect.cpp index d8ab93c4b..574066ecc 100644 --- a/src/linux/btop_collect.cpp +++ b/src/linux/btop_collect.cpp @@ -110,6 +110,20 @@ long long get_monotonicTimeUSec() return time.tv_sec * 1000000 + time.tv_nsec / 1000; } +auto trim_view(std::string_view value) -> std::string_view { + while (not value.empty() and is_in(value.front(), ' ', '\t')) value.remove_prefix(1); + while (not value.empty() and is_in(value.back(), ' ', '\t', '\r')) value.remove_suffix(1); + return value; +} + +auto parse_u64_prefix(std::string_view value, uint64_t& out) -> bool { + value = trim_view(value); + if (value.empty()) return false; + + auto [ptr, err] = std::from_chars(value.data(), value.data() + value.size(), out); + return err == std::errc{} and ptr != value.data(); +} + } namespace Cpu { @@ -149,16 +163,19 @@ namespace Cpu { namespace Gpu { vector gpus; //? NVIDIA data collection - namespace Nvml { - //? 
NVML defines, structs & typedefs - #define NVML_DEVICE_NAME_BUFFER_SIZE 64 - #define NVML_SUCCESS 0 - #define NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 0 - #define NVML_CLOCK_GRAPHICS 0 - #define NVML_CLOCK_MEM 2 - #define NVML_TEMPERATURE_GPU 0 - #define NVML_PCIE_UTIL_TX_BYTES 0 - #define NVML_PCIE_UTIL_RX_BYTES 1 + namespace Nvml { + //? NVML defines, structs & typedefs + #define NVML_DEVICE_NAME_BUFFER_SIZE 64 + #define NVML_SUCCESS 0 + #define NVML_ERROR_NOT_FOUND 6 + #define NVML_ERROR_INSUFFICIENT_SIZE 7 + #define NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 0 + #define NVML_CLOCK_GRAPHICS 0 + #define NVML_CLOCK_MEM 2 + #define NVML_TEMPERATURE_GPU 0 + #define NVML_PCIE_UTIL_TX_BYTES 0 + #define NVML_PCIE_UTIL_RX_BYTES 1 + #define NVML_VALUE_NOT_AVAILABLE_ULL (std::numeric_limits::max()) typedef void* nvmlDevice_t; // we won't be accessing any of the underlying struct's properties, so this is fine typedef int nvmlReturn_t, // enums are basically ints @@ -168,8 +185,37 @@ namespace Gpu { nvmlTemperatureSensors_t, nvmlPcieUtilCounter_t; - struct nvmlUtilization_t {unsigned int gpu, memory;}; - struct nvmlMemory_t {unsigned long long total, free, used;}; + struct nvmlUtilization_t {unsigned int gpu, memory;}; + struct nvmlMemory_t {unsigned long long total, free, used;}; + struct nvmlProcessUtilizationSample_t { + unsigned int pid; + unsigned long long timeStamp; + unsigned int smUtil; + unsigned int memUtil; + unsigned int encUtil; + unsigned int decUtil; + }; + struct nvmlProcessInfo_t { + unsigned int pid; + unsigned long long usedGpuMemory; + unsigned int gpuInstanceId; + unsigned int computeInstanceId; + unsigned long long usedGpuCcProtectedMemory; + }; + struct nvmlProcessInfo_v2_t { + unsigned int pid; + unsigned long long usedGpuMemory; + unsigned int gpuInstanceId; + unsigned int computeInstanceId; + }; + struct nvmlProcessInfo_v1_t { + unsigned int pid; + unsigned long long usedGpuMemory; + }; + struct proc_stat { + double gpu{}; + uint64_t mem{}; + }; //? 
Function pointers const char* (*nvmlErrorString)(nvmlReturn_t); @@ -186,19 +232,32 @@ namespace Gpu { nvmlReturn_t (*nvmlDeviceGetPowerState)(nvmlDevice_t, nvmlPstates_t*); nvmlReturn_t (*nvmlDeviceGetTemperature)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int*); nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t*); - nvmlReturn_t (*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int*); - nvmlReturn_t (*nvmlDeviceGetEncoderUtilization)(nvmlDevice_t, unsigned int*, unsigned int*); - nvmlReturn_t (*nvmlDeviceGetDecoderUtilization)(nvmlDevice_t, unsigned int*, unsigned int*); - - //? Data - void* nvml_dl_handle; - bool initialized = false; - bool init(); - bool shutdown(); - template bool collect(gpu_info* gpus_slice); - vector devices; - unsigned int device_count = 0; - } + nvmlReturn_t (*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int*); + nvmlReturn_t (*nvmlDeviceGetEncoderUtilization)(nvmlDevice_t, unsigned int*, unsigned int*); + nvmlReturn_t (*nvmlDeviceGetDecoderUtilization)(nvmlDevice_t, unsigned int*, unsigned int*); + nvmlReturn_t (*nvmlDeviceGetProcessUtilization)(nvmlDevice_t, nvmlProcessUtilizationSample_t*, unsigned int*, unsigned long long); + nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v1)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v1_t*); + nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v2)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*); + nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v3)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*); + nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v1)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v1_t*); + nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v2)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*); + nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v3)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*); + + //? 
Data + void* nvml_dl_handle; + bool initialized = false; + bool process_memory_functions_available = false; + bool process_utilization_function_available = false; + bool init(); + bool shutdown(); + template bool collect(gpu_info* gpus_slice); + bool collect_process_stats(); + auto get_process_stats() -> const std::unordered_map&; + vector devices; + vector process_last_timestamps; + std::unordered_map process_stats; + unsigned int device_count = 0; + } //? AMD data collection namespace Rsmi { @@ -1219,6 +1278,7 @@ namespace Gpu { } auto load_nvml_sym = [&](const char sym_name[]) { + dlerror(); auto sym = dlsym(nvml_dl_handle, sym_name); auto err = dlerror(); if (err != nullptr) { @@ -1227,6 +1287,13 @@ namespace Gpu { } else return sym; }; + auto try_load_nvml_sym = [&](const char sym_name[]) { + dlerror(); + auto sym = dlsym(nvml_dl_handle, sym_name); + (void)dlerror(); + return sym; + }; + #define LOAD_SYM(NAME) if ((NAME = (decltype(NAME))load_nvml_sym(#NAME)) == nullptr) return false LOAD_SYM(nvmlErrorString); @@ -1246,6 +1313,21 @@ namespace Gpu { LOAD_SYM(nvmlDeviceGetPcieThroughput); LOAD_SYM(nvmlDeviceGetEncoderUtilization); LOAD_SYM(nvmlDeviceGetDecoderUtilization); + nvmlDeviceGetProcessUtilization = (decltype(nvmlDeviceGetProcessUtilization))try_load_nvml_sym("nvmlDeviceGetProcessUtilization"); + nvmlDeviceGetGraphicsRunningProcesses_v1 = (decltype(nvmlDeviceGetGraphicsRunningProcesses_v1))try_load_nvml_sym("nvmlDeviceGetGraphicsRunningProcesses"); + nvmlDeviceGetGraphicsRunningProcesses_v2 = (decltype(nvmlDeviceGetGraphicsRunningProcesses_v2))try_load_nvml_sym("nvmlDeviceGetGraphicsRunningProcesses_v2"); + nvmlDeviceGetGraphicsRunningProcesses_v3 = (decltype(nvmlDeviceGetGraphicsRunningProcesses_v3))try_load_nvml_sym("nvmlDeviceGetGraphicsRunningProcesses_v3"); + nvmlDeviceGetComputeRunningProcesses_v1 = (decltype(nvmlDeviceGetComputeRunningProcesses_v1))try_load_nvml_sym("nvmlDeviceGetComputeRunningProcesses"); + 
nvmlDeviceGetComputeRunningProcesses_v2 = (decltype(nvmlDeviceGetComputeRunningProcesses_v2))try_load_nvml_sym("nvmlDeviceGetComputeRunningProcesses_v2"); + nvmlDeviceGetComputeRunningProcesses_v3 = (decltype(nvmlDeviceGetComputeRunningProcesses_v3))try_load_nvml_sym("nvmlDeviceGetComputeRunningProcesses_v3"); + process_utilization_function_available = nvmlDeviceGetProcessUtilization != nullptr; + process_memory_functions_available = + nvmlDeviceGetGraphicsRunningProcesses_v3 != nullptr or + nvmlDeviceGetGraphicsRunningProcesses_v2 != nullptr or + nvmlDeviceGetGraphicsRunningProcesses_v1 != nullptr or + nvmlDeviceGetComputeRunningProcesses_v3 != nullptr or + nvmlDeviceGetComputeRunningProcesses_v2 != nullptr or + nvmlDeviceGetComputeRunningProcesses_v1 != nullptr; #undef LOAD_SYM @@ -1265,6 +1347,8 @@ namespace Gpu { if (device_count > 0) { devices.resize(device_count); + process_last_timestamps.assign(device_count, 0ULL); + process_stats.clear(); gpus.resize(device_count); gpu_names.resize(device_count); @@ -1475,17 +1559,6 @@ namespace Gpu { } else gpus_slice[i].decoder_utilization = (long long)utilization; } - //? TODO: Processes using GPU - /*unsigned int proc_info_len; - nvmlProcessInfo_t* proc_info = 0; - result = nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_info_len, proc_info); - if (result != NVML_SUCCESS) { - Logger::warning("NVML: Failed to get compute processes: {}", nvmlErrorString(result)); - } else { - for (unsigned int i = 0; i < proc_info_len; ++i) - gpus_slice[i].graphics_processes.push_back({proc_info[i].pid, proc_info[i].usedGpuMemory}); - }*/ - // nvTimer.stop_rename_reset("Nv pcie thread join"); //? 
Join PCIE TX/RX threads if constexpr(is_init) { // there doesn't seem to be a better way to do this, but this should be fine considering it's just 2 lines @@ -1499,6 +1572,123 @@ namespace Gpu { return true; } + + bool collect_process_stats() { + if (not initialized or (not process_memory_functions_available and not process_utilization_function_available)) { + process_stats.clear(); + return false; + } + + process_stats.clear(); + + auto merge_process_memory = [&](unsigned int pid, unsigned long long used_mem, std::unordered_map& mem_by_pid) { + if (used_mem == NVML_VALUE_NOT_AVAILABLE_ULL) return; + auto& mem = mem_by_pid[pid]; + mem = max(mem, (uint64_t)used_mem); + }; + + auto append_process_memory_v3 = [&](nvmlDevice_t device, auto fn, std::unordered_map& mem_by_pid) { + if (fn == nullptr) return; + unsigned int proc_count = 0; + auto result = fn(device, &proc_count, nullptr); + if (result == NVML_ERROR_NOT_FOUND) return; + if (result != NVML_SUCCESS and result != NVML_ERROR_INSUFFICIENT_SIZE) return; + if (proc_count == 0) return; + + std::vector processes(proc_count); + result = fn(device, &proc_count, processes.data()); + if (result != NVML_SUCCESS) return; + + for (unsigned int n = 0; n < proc_count; ++n) { + merge_process_memory(processes[n].pid, processes[n].usedGpuMemory, mem_by_pid); + } + }; + + auto append_process_memory_v2 = [&](nvmlDevice_t device, auto fn, std::unordered_map& mem_by_pid) { + if (fn == nullptr) return; + unsigned int proc_count = 0; + auto result = fn(device, &proc_count, nullptr); + if (result == NVML_ERROR_NOT_FOUND) return; + if (result != NVML_SUCCESS and result != NVML_ERROR_INSUFFICIENT_SIZE) return; + if (proc_count == 0) return; + + std::vector processes(proc_count); + result = fn(device, &proc_count, processes.data()); + if (result != NVML_SUCCESS) return; + + for (unsigned int n = 0; n < proc_count; ++n) { + merge_process_memory(processes[n].pid, processes[n].usedGpuMemory, mem_by_pid); + } + }; + + auto 
append_process_memory_v1 = [&](nvmlDevice_t device, auto fn, std::unordered_map& mem_by_pid) { + if (fn == nullptr) return; + unsigned int proc_count = 0; + auto result = fn(device, &proc_count, nullptr); + if (result == NVML_ERROR_NOT_FOUND) return; + if (result != NVML_SUCCESS and result != NVML_ERROR_INSUFFICIENT_SIZE) return; + if (proc_count == 0) return; + + std::vector processes(proc_count); + result = fn(device, &proc_count, processes.data()); + if (result != NVML_SUCCESS) return; + + for (unsigned int n = 0; n < proc_count; ++n) { + merge_process_memory(processes[n].pid, processes[n].usedGpuMemory, mem_by_pid); + } + }; + + for (unsigned int i = 0; i < device_count; ++i) { + // Merge per-device process stats from whichever NVML API versions are available. + if (process_memory_functions_available) { + std::unordered_map mem_by_pid; + append_process_memory_v3(devices[i], nvmlDeviceGetGraphicsRunningProcesses_v3, mem_by_pid); + append_process_memory_v3(devices[i], nvmlDeviceGetComputeRunningProcesses_v3, mem_by_pid); + append_process_memory_v2(devices[i], nvmlDeviceGetGraphicsRunningProcesses_v2, mem_by_pid); + append_process_memory_v2(devices[i], nvmlDeviceGetComputeRunningProcesses_v2, mem_by_pid); + append_process_memory_v1(devices[i], nvmlDeviceGetGraphicsRunningProcesses_v1, mem_by_pid); + append_process_memory_v1(devices[i], nvmlDeviceGetComputeRunningProcesses_v1, mem_by_pid); + + for (const auto& [pid, mem] : mem_by_pid) { + process_stats[pid].mem += mem; + } + } + + if (process_utilization_function_available) { + unsigned int sample_count = 64; + std::vector samples(sample_count); + auto result = nvmlDeviceGetProcessUtilization(devices[i], samples.data(), &sample_count, process_last_timestamps.at(i)); + + if (result == NVML_ERROR_INSUFFICIENT_SIZE and sample_count > 0) { + samples.resize(sample_count); + result = nvmlDeviceGetProcessUtilization(devices[i], samples.data(), &sample_count, process_last_timestamps.at(i)); + } + if (result != 
NVML_SUCCESS) continue; + + std::unordered_map util_by_pid; + for (unsigned int n = 0; n < sample_count; ++n) { + const auto& sample = samples[n]; + process_last_timestamps.at(i) = max(process_last_timestamps.at(i), sample.timeStamp); + auto& util = util_by_pid[sample.pid]; + util = max(util, sample.smUtil); + } + + for (const auto& [pid, util] : util_by_pid) { + process_stats[pid].gpu += util; + } + } + } + + for (auto& [_, stat] : process_stats) { + stat.gpu = clamp(stat.gpu, 0.0, 100.0); + } + + return true; + } + + auto get_process_stats() -> const std::unordered_map& { + return process_stats; + } } //? AMD @@ -2799,6 +2989,7 @@ namespace Proc { string current_sort; string current_filter; bool current_rev{}; + bool current_gpu_only{}; bool is_tree_mode; fs::file_time_type passwd_time; @@ -2806,6 +2997,7 @@ namespace Proc { uint64_t cputimes; int collapse = -1, expand = -1, toggle_children = -1; uint64_t old_cputimes{}; + uint64_t old_gputimes{}; atomic numpids{}; int filter_found{}; @@ -2813,6 +3005,119 @@ namespace Proc { constexpr size_t KTHREADD = 2; static std::unordered_set kernels_procs = {KTHREADD}; static std::unordered_set dead_procs;
+	//* Per-process GPU usage, summed over every GPU seen in the process' fdinfo entries
+	struct gpu_proc_info {
+		uint64_t busy_ns{};   //? Summed engine busy counters (ns for drm-engine-* keys)
+		uint64_t mem_bytes{}; //? Summed memory usage in bytes
+	};
+
+	//* Collect DRM client usage (engine busy time and memory) from every entry in pid_path/fdinfo.
+	//* Returns zeroed counters if iterating the directory throws a filesystem_error.
+	static auto proc_gpu_info(const fs::path& pid_path) -> gpu_proc_info {
+		std::unordered_map<string, uint64_t> gpu_totals;
+		std::unordered_map<string, uint64_t> gpu_mems;
+
+		try {
+			for (const auto& d : fs::directory_iterator(pid_path / "fdinfo", fs::directory_options::skip_permission_denied)) {
+				ifstream fdread(d.path());
+				if (not fdread.good()) continue;
+
+				string driver, pdev, line;
+				std::unordered_map<string, uint64_t> drm_values;
+				std::unordered_map<string, uint64_t> engine_times;
+				std::unordered_map<string, uint64_t> engine_capacity;
+
+				while (getline(fdread, line)) {
+					const auto split = line.find(':');
+					if (split == string::npos) continue;
+
+					const std::string_view key{line.data(), split};
+					const auto value = trim_view(std::string_view{line}.substr(split + 1));
+
+					if (key == "drm-driver") {
+						driver = string{value};
+						continue;
+					}
+					if (key == "drm-pdev") {
+						pdev = string{value};
+						continue;
+					}
+
+					uint64_t raw{};
+					if (not parse_u64_prefix(value, raw)) continue;
+					//? Memory values are plain bytes unless suffixed with KiB or MiB, so normalize to bytes
+					//? here instead of assuming KiB for every key (assumes trim_view strips trailing blanks).
+					if (value.ends_with("KiB")) raw *= 1024ULL;
+					else if (value.ends_with("MiB")) raw *= 1024ULL * 1024ULL;
+					drm_values[string{key}] = raw;
+
+					if (not key.starts_with("drm-engine-")) continue;
+					const auto engine = key.substr(11); //? Drop the "drm-engine-" prefix
+					//? Capacity is used as a divisor below, so never store 0
+					if (engine.starts_with("capacity-")) engine_capacity[string{engine.substr(9)}] = max(uint64_t{1}, raw);
+					else engine_times[string{engine}] = raw;
+				}
+
+				//? Value parsed for key in this fdinfo file, 0 if the key was absent
+				const auto val = [&](const string& key) -> uint64_t {
+					const auto it = drm_values.find(key);
+					return it == drm_values.end() ? uint64_t{0} : it->second;
+				};
+
+				uint64_t fd_total{};
+				uint64_t fd_mem{};
+				if (driver == "amdgpu") {
+					fd_total = val("drm-engine-compute") + val("drm-engine-gfx");
+					fd_mem = val("drm-memory-gtt") + val("drm-memory-vram");
+				}
+				else if (driver == "i915") {
+					fd_total = val("drm-engine-render");
+					fd_mem = val("drm-total-local0") + val("drm-total-system0");
+				}
+				else if (driver == "v3d") {
+					fd_total = val("drm-engine-render");
+					fd_mem = val("drm-total-memory");
+				}
+				else if (driver == "xe") {
+					//? NOTE(review): drm-cycles-* are busy cycle counts, not nanoseconds - confirm how busy_ns consumers should scale this
+					fd_total = val("drm-cycles-rcs") + val("drm-cycles-ccs");
+					fd_mem = val("drm-total-gtt") + val("drm-total-vram0");
+				}
+				else {
+					//? Unknown driver: sum all engines scaled by their capacity, prefer resident over total memory
+					for (const auto& [engine, usage] : engine_times) {
+						const auto cap_it = engine_capacity.find(engine);
+						fd_total += usage / (cap_it == engine_capacity.end() ? 1ULL : cap_it->second);
+					}
+					uint64_t fd_mem_resident{};
+					uint64_t fd_mem_total{};
+					for (const auto& [key, value] : drm_values) {
+						if (key.starts_with("drm-resident-")) fd_mem_resident += value;
+						else if (key.starts_with("drm-total-")) fd_mem_total += value;
+					}
+					fd_mem = fd_mem_resident > 0 ? fd_mem_resident : fd_mem_total;
+				}
+
+				if (fd_total == 0 and fd_mem == 0) continue;
+				if (driver.empty()) driver = "unknown";
+
+				const auto gpu_id = pdev.empty() ? driver : fmt::format("{}:{}", driver, pdev);
+				// Keep the highest per-GPU values to avoid double-counting shared fdinfo snapshots.
+				auto& best_total = gpu_totals[gpu_id];
+				best_total = max(best_total, fd_total);
+				auto& best_mem = gpu_mems[gpu_id];
+				best_mem = max(best_mem, fd_mem);
+			}
+		}
+		catch (const fs::filesystem_error&) {
+			return {};
+		}
+
+		gpu_proc_info total{};
+		for (const auto& [_, usage] : gpu_totals) total.busy_ns += usage;
+		for (const auto& [_, usage] : gpu_mems) total.mem_bytes += usage;
+		return total;
+	}
//* Get detailed info for selected process static void _collect_details(const size_t pid, const uint64_t uptime, vector& procs) { @@ -2918,14 +3223,17 @@ namespace Proc { const auto& sorting = Config::getS("proc_sorting"); auto reverse = Config::getB("proc_reversed"); const auto& filter = Config::getS("proc_filter"); + const bool gpu_only = Config::getB("proc_gpu_only"); auto per_core = Config::getB("proc_per_core"); auto should_filter_kernel = Config::getB("proc_filter_kernel"); auto tree = Config::getB("proc_tree"); auto show_detailed = Config::getB("show_detailed"); const auto pause_proc_list = Config::getB("pause_proc_list"); const size_t detailed_pid = Config::getI("detailed_pid"); - bool should_filter = current_filter != filter; + bool should_filter = current_filter != filter or current_gpu_only != gpu_only; + if (gpu_only) should_filter = true; if (should_filter) current_filter = filter; + if (should_filter) current_gpu_only = gpu_only; bool sorted_change = (sorting != current_sort or reverse != current_rev or should_filter); bool tree_mode_change = tree != is_tree_mode; if (sorted_change) { @@ -2943,6 +3251,13 @@ namespace Proc { const int cmult = (per_core) ?
Shared::coreCount : 1; bool got_detailed = false; + uint64_t gpu_timestamp{}; + uint64_t gpu_time_passed{}; + #if defined(GPU_SUPPORT) + const std::unordered_map* nvml_proc_stats = nullptr; + #else + const void* nvml_proc_stats = nullptr; + #endif static size_t proc_clear_count{}; @@ -2954,6 +3269,16 @@ namespace Proc { else { should_filter = true; found.clear(); + gpu_timestamp = static_cast(get_monotonicTimeUSec()) * 1000ULL; + if (old_gputimes > 0 and gpu_timestamp > old_gputimes) + gpu_time_passed = gpu_timestamp - old_gputimes; + + #if defined(GPU_SUPPORT) + if (Gpu::Nvml::initialized) { + Gpu::Nvml::collect_process_stats(); + nvml_proc_stats = &Gpu::Nvml::get_process_stats(); + } + #endif //? First make sure kernel proc cache is cleared. if (should_filter_kernel and ++proc_clear_count >= 256) { @@ -3174,6 +3499,31 @@ namespace Proc { //? Process cpu usage since last update new_proc.cpu_p = clamp(round(cmult * 1000 * (cpu_t - new_proc.cpu_t) / max((uint64_t)1, cputimes - old_cputimes)) / 10.0, 0.0, 100.0 * Shared::coreCount); + bool used_nvml_gpu_stats = false; + #if defined(GPU_SUPPORT) + if (nvml_proc_stats != nullptr) { + auto found_proc = nvml_proc_stats->find(new_proc.pid); + if (found_proc != nvml_proc_stats->end()) { + new_proc.gpu_p = clamp(found_proc->second.gpu, 0.0, 100.0); + new_proc.gpu_m = found_proc->second.mem; + new_proc.gpu_t = 0; + used_nvml_gpu_stats = true; + } + } + #endif + + if (not used_nvml_gpu_stats) { + //? Process GPU usage since last update from /proc/[pid]/fdinfo + const auto gpu_stats = proc_gpu_info(d.path()); + const uint64_t gpu_t = gpu_stats.busy_ns; + if (gpu_time_passed > 0 and new_proc.gpu_t > 0 and gpu_t >= new_proc.gpu_t) + new_proc.gpu_p = clamp(round(1000.0 * (gpu_t - new_proc.gpu_t) / gpu_time_passed) / 10.0, 0.0, 100.0); + else + new_proc.gpu_p = 0.0; + new_proc.gpu_t = gpu_t; + new_proc.gpu_m = gpu_stats.mem_bytes; + } + //? 
Process cumulative cpu usage since process start new_proc.cpu_c = (double)cpu_t / max(1.0, (uptime * Shared::clkTck) - new_proc.cpu_s); @@ -3202,6 +3552,8 @@ namespace Proc { //? Reset cpu usage for dead processes if paused and option is set if (!keep_dead_proc_usage) { r.cpu_p = 0.0; + r.gpu_p = 0.0; + r.gpu_m = 0; r.mem = 0; } } @@ -3218,6 +3570,7 @@ namespace Proc { } old_cputimes = cputimes; + old_gputimes = gpu_timestamp; } //* ---------------------------------------------Collection done----------------------------------------------- @@ -3225,8 +3578,8 @@ namespace Proc { if (should_filter) { filter_found = 0; for (auto& p : current_procs) { - if (not tree and not filter.empty()) { - if (!matches_filter(p, filter)) { + if ((not tree and not filter.empty()) or gpu_only) { + if (not matches_filter(p, filter)) { p.filtered = true; filter_found++; } else { diff --git a/src/linux/intel_gpu_top/CMakeLists.txt b/src/linux/intel_gpu_top/CMakeLists.txt index e13e19397..533a1d8ac 100644 --- a/src/linux/intel_gpu_top/CMakeLists.txt +++ b/src/linux/intel_gpu_top/CMakeLists.txt @@ -7,6 +7,8 @@ add_library(igt OBJECT intel_name_lookup_shim.c ) +target_compile_definitions(igt PRIVATE _GNU_SOURCE) + if(BTOP_LTO) # We have checked LTO support already and it's supported :) set_target_properties(igt PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)