diff --git a/src/btop_config.cpp b/src/btop_config.cpp
index c9211771c..98fea6f61 100644
--- a/src/btop_config.cpp
+++ b/src/btop_config.cpp
@@ -104,7 +104,7 @@ namespace Config {
 
 		{"update_ms", 			"#* Update time in milliseconds, recommended 2000 ms or above for better sample times for graphs."},
 
-		{"proc_sorting",		"#* Processes sorting, \"pid\" \"program\" \"arguments\" \"threads\" \"user\" \"memory\" \"cpu lazy\" \"cpu direct\",\n"
+		{"proc_sorting",		"#* Processes sorting, \"pid\" \"program\" \"arguments\" \"threads\" \"user\" \"memory\" \"gpu\" \"gpu memory\" \"cpu lazy\" \"cpu direct\",\n"
 								"#* \"cpu lazy\" sorts top process over time (easier to follow), \"cpu direct\" updates top process directly."},
 
 		{"proc_reversed",		"#* Reverse sorting order, True or False."},
@@ -121,6 +121,10 @@ namespace Config {
 
 		{"proc_cpu_graphs",     "#* Show cpu graph for each process."},
 
+		{"proc_gpu_graphs",     "#* Show gpu graph for each process."},
+
+		{"proc_gpu_only",       "#* Show only processes with active GPU usage or GPU memory allocation in the process list."},
+
 		{"proc_info_smaps",		"#* Use /proc/[pid]/smaps for memory information in the process info box (very slow but more accurate)"},
 
 		{"proc_left",			"#* Show proc box on left side of screen instead of right."},
@@ -304,6 +308,8 @@ namespace Config {
 		{"proc_per_core", false},
 		{"proc_mem_bytes", true},
 		{"proc_cpu_graphs", true},
+		{"proc_gpu_graphs", true},
+		{"proc_gpu_only", false},
 		{"proc_info_smaps", false},
 		{"proc_left", false},
 		{"proc_filter_kernel", false},
diff --git a/src/btop_draw.cpp b/src/btop_draw.cpp
index 5f19e4b7d..93d72c137 100644
--- a/src/btop_draw.cpp
+++ b/src/btop_draw.cpp
@@ -1577,8 +1577,10 @@ namespace Proc {
 	int scroll_pos;
 	string selected_name;
 	std::unordered_map<size_t, Draw::Graph> p_graphs;
+	std::unordered_map<size_t, Draw::Graph> p_graphs_gpu;
 	std::unordered_map<size_t, bool> p_wide_cmd;
 	std::unordered_map<size_t, int> p_counters;
+	std::unordered_map<size_t, int> p_counters_gpu;
 	int counter = 0;
 	Draw::TextEdit filter;
 	Draw::Graph detailed_cpu_graph;
@@ -1681,7 +1683,15 @@ namespace Proc {
 		auto& graph_bg = Symbols::graph_symbols.at((graph_symbol == "default" ? Config::getS("graph_symbol") + "_up" : graph_symbol + "_up")).at(6);
 		auto mem_bytes = Config::getB("proc_mem_bytes");
 		auto vim_keys = Config::getB("vim_keys");
-		auto show_graphs = Config::getB("proc_cpu_graphs");
+		auto show_cpu_graphs = Config::getB("proc_cpu_graphs");
+		auto show_gpu_graphs = Config::getB("proc_gpu_graphs");
+		#if defined(__linux__) //? Per-process GPU columns are only enabled in Linux builds
+			const bool show_gpu = width >= ((show_cpu_graphs or show_gpu_graphs) ? 70 : 65); //? Gpu% column needs a box width of 70, or 65 when both graph options are off
+			const bool show_gpu_mem = width >= ((show_cpu_graphs or show_gpu_graphs) ? 78 : 73); //? GMem column needs 8 more columns than Gpu%, so it implies show_gpu
+		#else
+			const bool show_gpu = false; //? Other platforms never show the GPU columns
+			const bool show_gpu_mem = false;
+		#endif
 		const auto pause_proc_list = Config::getB("pause_proc_list");
 		auto follow_process = Config::getB("follow_process"); 
 		int followed_pid = Config::getI("followed_pid");
@@ -1772,10 +1782,14 @@ namespace Proc {
 			//? Adapt sizes of text fields
 			user_size = (width < 75 ? 5 : 10);
 			thread_size = (width < 75 ? - 1 : 4);
-			prog_size = (width > 70 ? 16 : ( width > 55 ? 8 : width - user_size - thread_size - 33));
-			cmd_size = (width > 55 ? width - prog_size - user_size - thread_size - 33 : -1);
-			tree_size = width - user_size - thread_size - 23;
-			if (not show_graphs) {
+			const int gpu_cols = (show_gpu_mem ? 6 : 0) + (show_gpu ? 6 : 0); //? 6 columns for each visible GPU text field (GMem, Gpu%)
+			const int gpu_graph_cols = (show_gpu and show_gpu_graphs ? 5 : 0); //? 5 more columns when the per-process GPU graph is drawn
+			const int proc_fixed = 33 + gpu_cols + gpu_graph_cols; //? Width taken out of the box before sizing prog/cmd in list mode
+			const int tree_fixed = 23 + gpu_cols + gpu_graph_cols; //? Same, for the tree column
+			prog_size = (width > 70 ? 16 : ( width > 55 ? 8 : width - user_size - thread_size - proc_fixed));
+			cmd_size = (width > 55 ? width - prog_size - user_size - thread_size - proc_fixed : -1);
+			tree_size = width - user_size - thread_size - tree_fixed;
+			if (not show_cpu_graphs) {
 				cmd_size += 5;
 				tree_size += 5;
 			}
@@ -1880,15 +1894,24 @@ namespace Proc {
 
 			//? pause, per-core, reverse, tree and sorting
 			const auto& sorting = Config::getS("proc_sorting");
+			const auto gpu_only = Config::getB("proc_gpu_only");
 			const int sort_len = sorting.size();
 			const int sort_pos = x + width - sort_len - 8;
 
+			if (width > 70 + sort_len) { //? "gpu-only" toggle is only drawn when the box is wider than 70 + sort_len
+				fmt::format_to(std::back_inserter(out), "{}{}{}{}{}{}{}{}{}{}{}",
+					Mv::to(y, sort_pos - 43), title_left, gpu_only ? Fx::b : "", //? Bold while proc_gpu_only is enabled
+					Theme::c("hi_fg"), 'g', Theme::c("title"), "pu-", //? 'g' is highlighted as the shortcut key
+					Theme::c("title"), "only",
+					Fx::ub, title_right);
+				Input::mouse_mappings["g"] = {y, sort_pos - 42, 1, 8}; //? Mouse region bound to "g" (presumably {line, col, height, width} - confirm)
+			}
 			if (width > 60 + sort_len) {
-			    fmt::format_to(std::back_inserter(out), "{}{}{}{}{}{}{}{}{}{}{}",
+				fmt::format_to(std::back_inserter(out), "{}{}{}{}{}{}{}{}{}{}{}",
 					Mv::to(y, sort_pos - 32), title_left, pause_proc_list ? Fx::b : "",
 					Theme::c("title"), "pa", Theme::c("hi_fg"), 'u', Theme::c("title"), "se",
-			    	Fx::ub, title_right);
-			    Input::mouse_mappings["u"] = {y, sort_pos - 31, 1, 5};
+					Fx::ub, title_right);
+				Input::mouse_mappings["u"] = {y, sort_pos - 31, 1, 5};
 			}
 			if (width > 55 + sort_len) {
 				out += Mv::to(y, sort_pos - 25) + title_left + (Config::getB("proc_per_core") ? Fx::b : "") + Theme::c("title")
@@ -1958,7 +1981,10 @@ namespace Proc {
 			out += (thread_size > 0 ? Mv::l(4) + "Threads: " : "")
 					+ ljust("User:", user_size) + ' '
 					+ rjust((mem_bytes ? "MemB" : "Mem%"), 5) + ' '
-					+ rjust("Cpu%", (show_graphs ? 10 : 5)) + Fx::ub;
+					+ rjust("Cpu%", (show_cpu_graphs ? 10 : 5))
+					+ (show_gpu_mem ? string{" "} + rjust("GMem", 5) : "")
+					+ (show_gpu ? string{" "} + rjust("Gpu%", (show_gpu_graphs ? 10 : 5)) : "")
+					+ Fx::ub;
 		}
 		//* End of redraw block
 
@@ -2021,9 +2047,9 @@ namespace Proc {
 				selected_depth = p.depth;
 			}
 
-			//? Update graphs for processes with above 0.0% cpu usage, delete if below 0.1% 10x times
-			bool has_graph = show_graphs ? p_counters.contains(p.pid) : false;
-			if (show_graphs and ((p.cpu_p > 0 and not has_graph) or (not data_same and has_graph))) {
+			//? Update cpu graphs for processes with above 0.0% cpu usage, delete if below 0.1% 10x times
+			bool has_graph = show_cpu_graphs ? p_counters.contains(p.pid) : false;
+			if (show_cpu_graphs and ((p.cpu_p > 0 and not has_graph) or (not data_same and has_graph))) {
 				if (not has_graph) {
 					p_graphs[p.pid] = Draw::Graph{5, 1, "", {}, graph_symbol};
 					p_counters[p.pid] = 0;
@@ -2036,6 +2062,21 @@ namespace Proc {
 					p_counters[p.pid] = 0;
 			}
 
+			//? Update gpu graphs for processes with above 0.0% gpu usage, delete after 10 consecutive samples below 0.1%
+			bool has_gpu_graph = (show_gpu and show_gpu_graphs) ? p_counters_gpu.contains(p.pid) : false;
+			if ((show_gpu and show_gpu_graphs) and ((p.gpu_p > 0 and not has_gpu_graph) or (not data_same and has_gpu_graph))) {
+				if (not has_gpu_graph) {
+					p_graphs_gpu[p.pid] = Draw::Graph{5, 1, "", {}, graph_symbol};
+					p_counters_gpu[p.pid] = 0;
+				}
+				else if (not (p.gpu_p < 0.1)) //? Active sample: restart the idle streak
+					p_counters_gpu[p.pid] = 0;
+				else if (++p_counters_gpu[p.pid] >= 10) { //? Idle samples no longer reset the counter, so the streak can reach 10
+					p_graphs_gpu.erase(p.pid);
+					p_counters_gpu.erase(p.pid);
+				}
+			}
+
 			out += Fx::reset;
 
 			//? Set correct gradient colors if enabled
@@ -2110,6 +2151,18 @@ namespace Proc {
 				if (cpu_str.ends_with('.')) cpu_str.pop_back();
 				cpu_str += "k";
 			}
+			string gpu_str;
+			if (show_gpu) {
+				if (p.gpu_p <= 0.0 and p.gpu_m == 0) { //? No utilization and no GPU memory: show a placeholder
+					gpu_str = "-";
+				}
+				else {
+					gpu_str = fmt::format("{:.1f}", clamp(p.gpu_p, 0.0, 100.0)); //? One decimal, clamped to 0-100
+					if (gpu_str.size() > 4) gpu_str.resize(4); //? Keep at most 4 characters ("100.0" -> "100.")
+					if (gpu_str.ends_with('.')) gpu_str.pop_back(); //? Drop a dangling decimal point ("100." -> "100")
+				}
+			}
+			const string gpu_mem_str = show_gpu_mem ? (p.gpu_m == 0 ? "-" : floating_humanizer(p.gpu_m, true)) : ""; //? "-" when gpu_m is 0, otherwise formatted by floating_humanizer
 			string mem_str = (mem_bytes ? floating_humanizer(p.mem, true) : "");
 			if (not mem_bytes) {
 				double mem_p = clamp((double)p.mem * 100 / totalMem, 0.0, 100.0);
@@ -2131,9 +2184,13 @@ namespace Proc {
 			out += (thread_size > 0 ? t_color + rjust(proc_threads_string, thread_size) + ' ' + end : "" )
 				+ g_color + ljust((cmp_greater(p.user.size(), user_size) ? p.user.substr(0, user_size - 1) + '+' : p.user), user_size) + ' '
 				+ m_color + rjust(mem_str, 5) + end + ' '
-				+ (is_selected or is_followed ? "" : Theme::c("inactive_fg")) + (show_graphs ? graph_bg * 5: "")
+				+ (is_selected or is_followed ? "" : Theme::c("inactive_fg")) + (show_cpu_graphs ? graph_bg * 5: "")
 				+ (p_graphs.contains(p.pid) ? Mv::l(5) + c_color + p_graphs.at(p.pid)({(p.cpu_p >= 0.1 and p.cpu_p < 5 ? 5ll : (long long)round(p.cpu_p))}, data_same) : "") + end + ' '
-				+ c_color + rjust(cpu_str, 4) + "  " + end;
+				+ c_color + rjust(cpu_str, 4) + ' ' + end
+				+ (show_gpu_mem ? c_color + rjust(gpu_mem_str, 5) + ' ' + end : "")
+				+ (is_selected or is_followed ? "" : Theme::c("inactive_fg")) + ((show_gpu and show_gpu_graphs) ? graph_bg * 5 : "")
+				+ ((show_gpu and show_gpu_graphs) and p_graphs_gpu.contains(p.pid) ? Mv::l(5) + c_color + p_graphs_gpu.at(p.pid)({(p.gpu_p >= 0.1 and p.gpu_p < 5 ? 5ll : (long long)round(p.gpu_p))}, data_same) : "") + end //? Graphs survive a resize, so only draw while the column is visible; otherwise Mv::l(5) would overwrite the Cpu% text
+				+ (show_gpu ? c_color + rjust(gpu_str, 5) + ' ' + end : "");
 			if (lc++ > height - 5) break;
 			else if (lc > height - 5 and proc_banner_shown) break;
 		}
@@ -2182,6 +2239,14 @@ namespace Proc {
 				return rng::find(plist, pair.first, &proc_info::pid) == plist.end();
 			});
 
+			std::erase_if(p_graphs_gpu, [&](const auto& pair) {
+				return rng::find(plist, pair.first, &proc_info::pid) == plist.end();
+			});
+
+			std::erase_if(p_counters_gpu, [&](const auto& pair) {
+				return rng::find(plist, pair.first, &proc_info::pid) == plist.end();
+			});
+
 			std::erase_if(p_wide_cmd, [&](const auto& pair) {
 				return rng::find(plist, pair.first, &proc_info::pid) == plist.end();
 			});
@@ -2230,7 +2295,9 @@ namespace Draw {
 		Runner::redraw = true;
 		if (not (Proc::resized or Global::resized)) {
 			Proc::p_counters.clear();
+			Proc::p_counters_gpu.clear();
 			Proc::p_graphs.clear();
+			Proc::p_graphs_gpu.clear();
 		}
 		if (Menu::active) Menu::redraw = true;
 
diff --git a/src/btop_input.cpp b/src/btop_input.cpp
index 28a966773..245bb0664 100644
--- a/src/btop_input.cpp
+++ b/src/btop_input.cpp
@@ -377,6 +377,11 @@ namespace Input {
 				else if (key == "c")
 					Config::flip("proc_per_core");
 
+				else if (key == "g") { //? Toggle the GPU-only process filter
+					Config::flip("proc_gpu_only");
+					Config::set("update_following", true); //? NOTE(review): presumably refreshes the followed-process state after the list changes - confirm
+				}
+
 				else if (key == "%")
 					Config::flip("proc_mem_bytes");
 
diff --git a/src/btop_menu.cpp b/src/btop_menu.cpp
index fbaadf32d..897baf084 100644
--- a/src/btop_menu.cpp
+++ b/src/btop_menu.cpp
@@ -201,6 +201,7 @@ namespace Menu {
 		{"f, /", "To enter a process filter. Start with ! for regex."},
 		{"F", "Follow selected process."},
 		{"u", "Pause process list."},
+		{"g", "Toggle GPU-only process filter."},
 		{"delete", "Clear any entered filter."},
 		{"c", "Toggle per-core cpu usage of processes."},
 		{"r", "Reverse sorting order in processes box."},
@@ -817,7 +818,8 @@ namespace Menu {
 				"",
 				"Possible values:",
 				"\"pid\", \"program\", \"arguments\", \"threads\",",
-				"\"user\", \"memory\", \"cpu lazy\" and",
+				"\"user\", \"memory\", \"gpu\",",
+				"\"gpu memory\", \"cpu lazy\" and",
 				"\"cpu direct\".",
 				"",
 				"\"cpu lazy\" updates top process over time.",
@@ -872,6 +874,16 @@ namespace Menu {
 				"Show cpu graph for each process.",
 				"",
 				"True or False"},
+			{"proc_gpu_graphs",
+				"Show gpu graph for each process.",
+				"",
+				"True or False"},
+			{"proc_gpu_only",
+				"Show only GPU-active processes.",
+				"",
+				"When enabled, only processes with",
+				"non-zero GPU usage or GPU memory",
+				"allocation are shown."},
 			{"proc_filter_kernel",
 				"(Linux) Filter kernel processes from output.",
 				"",
diff --git a/src/btop_shared.cpp b/src/btop_shared.cpp
index 0990fe681..3c8526a7d 100644
--- a/src/btop_shared.cpp
+++ b/src/btop_shared.cpp
@@ -109,8 +109,10 @@ bool set_priority(pid_t pid, int priority) {
 			case 3: rng::stable_sort(proc_vec, rng::less{}, &proc_info::threads);	break;
 			case 4: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::user); 		break;
 			case 5: rng::stable_sort(proc_vec, rng::less{}, &proc_info::mem); 		break;
-			case 6: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_p);		break;
-			case 7: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_c);		break;
+			case 6: rng::stable_sort(proc_vec, rng::less{}, &proc_info::gpu_p);		break;
+			case 7: rng::stable_sort(proc_vec, rng::less{}, &proc_info::gpu_m);		break;
+			case 8: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_p);		break;
+			case 9: rng::stable_sort(proc_vec, rng::less{}, &proc_info::cpu_c);		break;
 			}
 		}
 		else {
@@ -121,8 +123,10 @@ bool set_priority(pid_t pid, int priority) {
 			case 3: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::threads);	break;
 			case 4: rng::stable_sort(proc_vec, rng::less{}, &proc_info::user);		break;
 			case 5: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::mem); 		break;
-			case 6: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_p);   	break;
-			case 7: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_c);   	break;
+			case 6: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::gpu_p);   	break;
+			case 7: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::gpu_m);   	break;
+			case 8: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_p);   	break;
+			case 9: rng::stable_sort(proc_vec, rng::greater{}, &proc_info::cpu_c);   	break;
 			}
 		}
 
@@ -150,16 +154,20 @@ bool set_priority(pid_t pid, int priority) {
 				switch (v_index(sort_vector, sorting)) {
 				case 3: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().threads < b.entry.get().threads; });	break;
 				case 5: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().mem < b.entry.get().mem; });	break;
-				case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p < b.entry.get().cpu_p; });	break;
-				case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c < b.entry.get().cpu_c; });	break;
+				case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_p < b.entry.get().gpu_p; });	break;
+				case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_m < b.entry.get().gpu_m; });	break;
+				case 8: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p < b.entry.get().cpu_p; });	break;
+				case 9: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c < b.entry.get().cpu_c; });	break;
 				}
 			}
 			else {
 				switch (v_index(sort_vector, sorting)) {
 				case 3: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().threads > b.entry.get().threads; });	break;
 				case 5: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().mem > b.entry.get().mem; });	break;
-				case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p > b.entry.get().cpu_p; });	break;
-				case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c > b.entry.get().cpu_c; });	break;
+				case 6: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_p > b.entry.get().gpu_p; });	break;
+				case 7: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().gpu_m > b.entry.get().gpu_m; });	break;
+				case 8: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_p > b.entry.get().cpu_p; });	break;
+				case 9: rng::stable_sort(proc_vec, [](const auto& a, const auto& b) { return a.entry.get().cpu_c > b.entry.get().cpu_c; });	break;
 				}
 			}
 		}
@@ -173,6 +181,10 @@ bool set_priority(pid_t pid, int priority) {
 	}
 
 	auto matches_filter(const proc_info& proc, const std::string& filter) -> bool {
+		if (Config::getB("proc_gpu_only") and proc.gpu_p <= 0.0 and proc.gpu_m == 0) { //? GPU-only mode: reject processes with neither GPU utilization nor GPU memory, before the text filter is looked at
+			return false;
+		}
+
 		if (filter.starts_with("!")) {
 			if (filter.size() == 1) {
 				return true;
@@ -242,6 +254,8 @@ bool set_priority(pid_t pid, int priority) {
 				if (p.state != 'X') {
 					cur_proc.cpu_p += p.cpu_p;
 					cur_proc.cpu_c += p.cpu_c;
+					cur_proc.gpu_p += p.gpu_p;
+					cur_proc.gpu_m += p.gpu_m;
 					cur_proc.mem += p.mem;
 					cur_proc.threads += p.threads;
 				}
@@ -251,6 +265,8 @@ bool set_priority(pid_t pid, int priority) {
 			else if (Config::getB("proc_aggregate") and p.state != 'X') {
 				cur_proc.cpu_p += p.cpu_p;
 				cur_proc.cpu_c += p.cpu_c;
+				cur_proc.gpu_p += p.gpu_p;
+				cur_proc.gpu_m += p.gpu_m;
 				cur_proc.mem += p.mem;
 				cur_proc.threads += p.threads;
 			}
diff --git a/src/btop_shared.hpp b/src/btop_shared.hpp
index 96f4de32a..cc40182f1 100644
--- a/src/btop_shared.hpp
+++ b/src/btop_shared.hpp
@@ -373,6 +373,8 @@ namespace Proc {
 		"threads",
 		"user",
 		"memory",
+		"gpu",
+		"gpu memory",
 		"cpu direct",
 		"cpu lazy",
 	};
@@ -404,11 +406,14 @@ namespace Proc {
 		uint64_t mem{};
 		double cpu_p{};         // defaults to = 0.0
 		double cpu_c{};         // defaults to = 0.0
+		double gpu_p{};         // defaults to = 0.0
+		uint64_t gpu_m{};
 		char state = '0';
 		int64_t p_nice{};
 		uint64_t ppid{};
 		uint64_t cpu_s{};
 		uint64_t cpu_t{};
+		uint64_t gpu_t{};
 		uint64_t death_time{};
 		string prefix{};        // defaults to ""
 		size_t depth{};
diff --git a/src/linux/btop_collect.cpp b/src/linux/btop_collect.cpp
index d8ab93c4b..574066ecc 100644
--- a/src/linux/btop_collect.cpp
+++ b/src/linux/btop_collect.cpp
@@ -110,6 +110,20 @@ long long get_monotonicTimeUSec()
 	return time.tv_sec * 1000000 + time.tv_nsec / 1000;
 }
 
+auto trim_view(std::string_view value) -> std::string_view { //? Strip leading spaces/tabs and trailing spaces/tabs/carriage returns; returns a view into the same buffer
+	while (not value.empty() and is_in(value.front(), ' ', '\t')) value.remove_prefix(1);
+	while (not value.empty() and is_in(value.back(), ' ', '\t', '\r')) value.remove_suffix(1);
+	return value;
+}
+
+auto parse_u64_prefix(std::string_view value, uint64_t& out) -> bool { //? Parse the unsigned number at the start of value into out; any text after the digits is ignored
+	value = trim_view(value);
+	if (value.empty()) return false;
+
+	auto [ptr, err] = std::from_chars(value.data(), value.data() + value.size(), out); //? Stops at the first character that is not part of the number
+	return err == std::errc{} and ptr != value.data(); //? True only when something was consumed and no error was reported
+}
+
 }
 
 namespace Cpu {
@@ -149,16 +163,19 @@ namespace Cpu {
 namespace Gpu {
 	vector<gpu_info> gpus;
 	//? NVIDIA data collection
 	namespace Nvml {
 		//? NVML defines, structs & typedefs
 		#define NVML_DEVICE_NAME_BUFFER_SIZE        64
 		#define NVML_SUCCESS                         0
+		#define NVML_ERROR_NOT_FOUND                 6 // nvmlReturn_t: the query found nothing
+		#define NVML_ERROR_INSUFFICIENT_SIZE         7 // nvmlReturn_t: caller buffer too small, required count is written back
 		#define NVML_TEMPERATURE_THRESHOLD_SHUTDOWN  0
 		#define NVML_CLOCK_GRAPHICS                  0
 		#define NVML_CLOCK_MEM                       2
 		#define NVML_TEMPERATURE_GPU                 0
 		#define NVML_PCIE_UTIL_TX_BYTES              0
 		#define NVML_PCIE_UTIL_RX_BYTES              1
+		#define NVML_VALUE_NOT_AVAILABLE_ULL         (std::numeric_limits<unsigned long long>::max()) // usedGpuMemory sentinel, skipped when merging process memory
 
 		typedef void* nvmlDevice_t; // we won't be accessing any of the underlying struct's properties, so this is fine
 		typedef int nvmlReturn_t, // enums are basically ints
@@ -168,8 +185,37 @@ namespace Gpu {
 					nvmlTemperatureSensors_t,
 					nvmlPcieUtilCounter_t;
 
 		struct nvmlUtilization_t {unsigned int gpu, memory;};
 		struct nvmlMemory_t {unsigned long long total, free, used;};
+		struct nvmlProcessUtilizationSample_t { // entry filled by nvmlDeviceGetProcessUtilization
+			unsigned int pid;
+			unsigned long long timeStamp;
+			unsigned int smUtil;
+			unsigned int memUtil;
+			unsigned int encUtil;
+			unsigned int decUtil;
+		};
+		struct nvmlProcessInfo_t { // entry type passed to the *RunningProcesses_v3 pointers; NOTE(review): confirm layout against the nvml.h of supported drivers
+			unsigned int pid;
+			unsigned long long usedGpuMemory;
+			unsigned int gpuInstanceId;
+			unsigned int computeInstanceId;
+			unsigned long long usedGpuCcProtectedMemory;
+		};
+		struct nvmlProcessInfo_v2_t { // entry type passed to the *RunningProcesses_v2 pointers
+			unsigned int pid;
+			unsigned long long usedGpuMemory;
+			unsigned int gpuInstanceId;
+			unsigned int computeInstanceId;
+		};
+		struct nvmlProcessInfo_v1_t { // entry type passed to the *RunningProcesses_v1 pointers
+			unsigned int pid;
+			unsigned long long usedGpuMemory;
+		};
+		struct proc_stat { // per-pid values stored in process_stats
+			double gpu{};
+			uint64_t mem{};
+		};
 
 		//? Function pointers
 		const char* (*nvmlErrorString)(nvmlReturn_t);
@@ -186,19 +232,32 @@ namespace Gpu {
 		nvmlReturn_t (*nvmlDeviceGetPowerState)(nvmlDevice_t, nvmlPstates_t*);
 		nvmlReturn_t (*nvmlDeviceGetTemperature)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int*);
 		nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t*);
 		nvmlReturn_t (*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int*);
 		nvmlReturn_t (*nvmlDeviceGetEncoderUtilization)(nvmlDevice_t, unsigned int*, unsigned int*);
 		nvmlReturn_t (*nvmlDeviceGetDecoderUtilization)(nvmlDevice_t, unsigned int*, unsigned int*);
+		nvmlReturn_t (*nvmlDeviceGetProcessUtilization)(nvmlDevice_t, nvmlProcessUtilizationSample_t*, unsigned int*, unsigned long long);
+		nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v1)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v1_t*);
+		nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v2)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
+		nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v3)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*);
+		nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v1)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v1_t*);
+		nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v2)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
+		nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v3)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*);
 
 		//? Data
 		void* nvml_dl_handle;
 		bool initialized = false;
+		bool process_memory_functions_available = false; // true when at least one *RunningProcesses symbol was loaded
+		bool process_utilization_function_available = false; // true when nvmlDeviceGetProcessUtilization was loaded
 		bool init();
 		bool shutdown();
 		template <bool is_init> bool collect(gpu_info* gpus_slice);
+		bool collect_process_stats();
+		auto get_process_stats() -> const std::unordered_map<size_t, proc_stat>&;
 		vector<nvmlDevice_t> devices;
+		vector<unsigned long long> process_last_timestamps; // one entry per device: newest utilization sample timestamp seen
+		std::unordered_map<size_t, proc_stat> process_stats; // keyed by pid
 		unsigned int device_count = 0;
 	}
 
 	//? AMD data collection
 	namespace Rsmi {
@@ -1219,6 +1278,7 @@ namespace Gpu {
  			}
 
 			auto load_nvml_sym = [&](const char sym_name[]) {
+				dlerror();
 				auto sym = dlsym(nvml_dl_handle, sym_name);
 				auto err = dlerror();
 				if (err != nullptr) {
@@ -1227,6 +1287,13 @@ namespace Gpu {
 				} else return sym;
 			};
 
+			auto try_load_nvml_sym = [&](const char sym_name[]) { //? Optional symbol lookup: returns whatever dlsym gives (nullptr if absent) and never treats a miss as an error
+				dlerror(); //? Clear any stale error state before the lookup
+				auto sym = dlsym(nvml_dl_handle, sym_name);
+				(void)dlerror(); //? Discard the lookup error, if any
+				return sym;
+			};
+
             #define LOAD_SYM(NAME)  if ((NAME = (decltype(NAME))load_nvml_sym(#NAME)) == nullptr) return false
 
 		    LOAD_SYM(nvmlErrorString);
@@ -1246,6 +1313,21 @@ namespace Gpu {
 		    LOAD_SYM(nvmlDeviceGetPcieThroughput);
 			LOAD_SYM(nvmlDeviceGetEncoderUtilization);
 			LOAD_SYM(nvmlDeviceGetDecoderUtilization);
+			nvmlDeviceGetProcessUtilization = (decltype(nvmlDeviceGetProcessUtilization))try_load_nvml_sym("nvmlDeviceGetProcessUtilization"); //? Per-process symbols are optional: a missing one leaves its pointer null and init continues
+			nvmlDeviceGetGraphicsRunningProcesses_v1 = (decltype(nvmlDeviceGetGraphicsRunningProcesses_v1))try_load_nvml_sym("nvmlDeviceGetGraphicsRunningProcesses"); //? The unversioned export is bound to the _v1 pointer
+			nvmlDeviceGetGraphicsRunningProcesses_v2 = (decltype(nvmlDeviceGetGraphicsRunningProcesses_v2))try_load_nvml_sym("nvmlDeviceGetGraphicsRunningProcesses_v2");
+			nvmlDeviceGetGraphicsRunningProcesses_v3 = (decltype(nvmlDeviceGetGraphicsRunningProcesses_v3))try_load_nvml_sym("nvmlDeviceGetGraphicsRunningProcesses_v3");
+			nvmlDeviceGetComputeRunningProcesses_v1 = (decltype(nvmlDeviceGetComputeRunningProcesses_v1))try_load_nvml_sym("nvmlDeviceGetComputeRunningProcesses"); //? The unversioned export is bound to the _v1 pointer
+			nvmlDeviceGetComputeRunningProcesses_v2 = (decltype(nvmlDeviceGetComputeRunningProcesses_v2))try_load_nvml_sym("nvmlDeviceGetComputeRunningProcesses_v2");
+			nvmlDeviceGetComputeRunningProcesses_v3 = (decltype(nvmlDeviceGetComputeRunningProcesses_v3))try_load_nvml_sym("nvmlDeviceGetComputeRunningProcesses_v3");
+			process_utilization_function_available = nvmlDeviceGetProcessUtilization != nullptr;
+			process_memory_functions_available = //? True when any graphics/compute running-process variant resolved
+				nvmlDeviceGetGraphicsRunningProcesses_v3 != nullptr or
+				nvmlDeviceGetGraphicsRunningProcesses_v2 != nullptr or
+				nvmlDeviceGetGraphicsRunningProcesses_v1 != nullptr or
+				nvmlDeviceGetComputeRunningProcesses_v3 != nullptr or
+				nvmlDeviceGetComputeRunningProcesses_v2 != nullptr or
+				nvmlDeviceGetComputeRunningProcesses_v1 != nullptr;
 
             #undef LOAD_SYM
 
@@ -1265,6 +1347,8 @@ namespace Gpu {
 
 			if (device_count > 0) {
 				devices.resize(device_count);
+				process_last_timestamps.assign(device_count, 0ULL);
+				process_stats.clear();
 				gpus.resize(device_count);
 				gpu_names.resize(device_count);
 
@@ -1475,17 +1559,6 @@ namespace Gpu {
 					} else gpus_slice[i].decoder_utilization = (long long)utilization;
 				}
 
-    			//? TODO: Processes using GPU
-    				/*unsigned int proc_info_len;
-    				nvmlProcessInfo_t* proc_info = 0;
-    				result = nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_info_len, proc_info);
-    				if (result != NVML_SUCCESS) {
-						Logger::warning("NVML: Failed to get compute processes: {}", nvmlErrorString(result));
-    				} else {
-    					for (unsigned int i = 0; i < proc_info_len; ++i)
-    						gpus_slice[i].graphics_processes.push_back({proc_info[i].pid, proc_info[i].usedGpuMemory});
-    				}*/
-
 				// nvTimer.stop_rename_reset("Nv pcie thread join");
 				//? Join PCIE TX/RX threads
 				if constexpr(is_init) { // there doesn't seem to be a better way to do this, but this should be fine considering it's just 2 lines
@@ -1499,6 +1572,94 @@ namespace Gpu {
 
 			return true;
 		}
+
+		//? Rebuild process_stats: per-pid GPU memory and SM utilization, summed over every NVML device.
+		//? Returns false, leaving process_stats empty, when NVML is not initialized or no per-process symbol was loaded.
+		bool collect_process_stats() {
+			process_stats.clear();
+			if (not initialized or (not process_memory_functions_available and not process_utilization_function_available)) {
+				return false;
+			}
+
+			//? Query one Get*RunningProcesses variant and keep the largest memory value seen for each pid.
+			//? Info is deduced from the function pointer, so the entry layout always matches the symbol version.
+			auto append_process_memory = [&]<typename Info>(nvmlDevice_t device, nvmlReturn_t (*fn)(nvmlDevice_t, unsigned int*, Info*), std::unordered_map<size_t, uint64_t>& mem_by_pid) {
+				if (fn == nullptr) return;
+				unsigned int proc_count = 0;
+				auto result = fn(device, &proc_count, nullptr);
+				if (result != NVML_SUCCESS and result != NVML_ERROR_INSUFFICIENT_SIZE) return;
+				if (proc_count == 0) return;
+
+				//? The process list can grow between the size query and the fetch, so leave some headroom
+				proc_count += 8;
+				std::vector<Info> processes(proc_count);
+				result = fn(device, &proc_count, processes.data());
+				if (result != NVML_SUCCESS) return;
+
+				for (unsigned int n = 0; n < proc_count; ++n) {
+					if (processes[n].usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ULL) continue;
+					auto& mem = mem_by_pid[processes[n].pid];
+					mem = max(mem, (uint64_t)processes[n].usedGpuMemory);
+				}
+			};
+
+			for (unsigned int i = 0; i < device_count; ++i) {
+				//? Memory: merge whichever NVML API versions are available, then add this device's value per pid
+				if (process_memory_functions_available) {
+					std::unordered_map<size_t, uint64_t> mem_by_pid;
+					append_process_memory(devices[i], nvmlDeviceGetGraphicsRunningProcesses_v3, mem_by_pid);
+					append_process_memory(devices[i], nvmlDeviceGetComputeRunningProcesses_v3, mem_by_pid);
+					append_process_memory(devices[i], nvmlDeviceGetGraphicsRunningProcesses_v2, mem_by_pid);
+					append_process_memory(devices[i], nvmlDeviceGetComputeRunningProcesses_v2, mem_by_pid);
+					append_process_memory(devices[i], nvmlDeviceGetGraphicsRunningProcesses_v1, mem_by_pid);
+					append_process_memory(devices[i], nvmlDeviceGetComputeRunningProcesses_v1, mem_by_pid);
+
+					for (const auto& [pid, mem] : mem_by_pid) {
+						process_stats[pid].mem += mem;
+					}
+				}
+
+				//? Utilization: the last timestamp seen on this device is passed so only newer samples are requested
+				if (process_utilization_function_available) {
+					unsigned int sample_count = 64;
+					std::vector<nvmlProcessUtilizationSample_t> samples(sample_count);
+					auto& last_seen = process_last_timestamps.at(i);
+					auto result = nvmlDeviceGetProcessUtilization(devices[i], samples.data(), &sample_count, last_seen);
+
+					//? Retry once with the sample count NVML asked for
+					if (result == NVML_ERROR_INSUFFICIENT_SIZE and sample_count > 0) {
+						samples.resize(sample_count);
+						result = nvmlDeviceGetProcessUtilization(devices[i], samples.data(), &sample_count, last_seen);
+					}
+					if (result != NVML_SUCCESS) continue;
+
+					//? Keep the highest smUtil per pid within this batch
+					std::unordered_map<size_t, unsigned int> util_by_pid;
+					for (unsigned int n = 0; n < sample_count; ++n) {
+						const auto& sample = samples[n];
+						last_seen = max(last_seen, sample.timeStamp);
+						auto& util = util_by_pid[sample.pid];
+						util = max(util, sample.smUtil);
+					}
+
+					for (const auto& [pid, util] : util_by_pid) {
+						process_stats[pid].gpu += util;
+					}
+				}
+			}
+
+			//? Sums across several devices can exceed 100%
+			for (auto& [_, stat] : process_stats) {
+				stat.gpu = clamp(stat.gpu, 0.0, 100.0);
+			}
+
+			return true;
+		}
+
+		//? Read-only view of the stats produced by the last collect_process_stats() call
+		auto get_process_stats() -> const std::unordered_map<size_t, proc_stat>& {
+			return process_stats;
+		}
     }
 
 	//? AMD
@@ -2799,6 +2989,7 @@ namespace Proc {
 	string current_sort;
 	string current_filter;
 	bool current_rev{};
+	bool current_gpu_only{}; //? "proc_gpu_only" value that was in effect when the filter was last applied
 	bool is_tree_mode;
 
 	fs::file_time_type passwd_time;
@@ -2806,6 +2997,7 @@ namespace Proc {
 	uint64_t cputimes;
 	int collapse = -1, expand = -1, toggle_children = -1;
 	uint64_t old_cputimes{};
+	uint64_t old_gputimes{}; //? Monotonic timestamp (ns) of the previous collection, 0 until the first one has run
 	atomic<int> numpids{};
 	int filter_found{};
 
@@ -2813,6 +3005,119 @@ namespace Proc {
 	constexpr size_t KTHREADD = 2;
 	static std::unordered_set<size_t> kernels_procs = {KTHREADD};
 	static std::unordered_set<size_t> dead_procs;
+	struct gpu_proc_info { //? Per-process GPU totals produced by proc_gpu_info()
+		uint64_t busy_ns{}; //? Summed engine busy counter; the collector diffs it between updates to get gpu_p
+		uint64_t mem_bytes{}; //? Summed GPU memory figure; the collector stores it as gpu_m
+	};
+
+	static auto proc_gpu_info(const fs::path& pid_path) -> gpu_proc_info { //? Sum GPU busy counters and memory found in every file under <pid_path>/fdinfo
+		std::unordered_map<string, uint64_t> gpu_totals; //? Highest busy counter seen per gpu_id
+		std::unordered_map<string, uint64_t> gpu_mems; //? Highest memory figure seen per gpu_id
+
+		try {
+			for (const auto& d : fs::directory_iterator(pid_path / "fdinfo", fs::directory_options::skip_permission_denied)) {
+				ifstream fdread(d.path());
+				if (not fdread.good()) continue; //? Entry could not be opened, skip it
+
+				string driver, pdev, line;
+				std::unordered_map<string, uint64_t> drm_values; //? Every "key: <number>" pair parsed from this file
+				std::unordered_map<string, uint64_t> engine_times; //? drm-engine-<name> values
+				std::unordered_map<string, uint64_t> engine_capacity; //? drm-engine-capacity-<name> values, floored at 1
+
+				while (getline(fdread, line)) {
+					auto split = line.find(':');
+					if (split == string::npos) continue; //? Not a "key: value" line
+
+					const std::string_view key{line.data(), split};
+					const auto value = trim_view(std::string_view{line}.substr(split + 1));
+
+					if (key == "drm-driver") {
+						driver = string{value};
+						continue;
+					}
+					if (key == "drm-pdev") {
+						pdev = string{value};
+						continue;
+					}
+
+					uint64_t raw{};
+					if (not parse_u64_prefix(value, raw)) continue; //? Skip values the helper rejects; a trailing unit is presumably ignored - TODO confirm
+					drm_values[string{key}] = raw;
+
+					if (not key.starts_with("drm-engine-")) continue;
+
+					auto engine = key.substr(11); //? Drop the "drm-engine-" prefix (11 chars)
+
+					if (engine.starts_with("capacity-")) {
+						engine_capacity[string{engine.substr(9)}] = max((uint64_t)1, raw); //? Drop "capacity-" (9 chars); floor at 1 so the division below is safe
+					}
+					else {
+						engine_times[string{engine}] = raw;
+					}
+				}
+
+				auto val = [&](const string& key) { return drm_values.contains(key) ? drm_values.at(key) : 0ULL; }; //? Lookup that yields 0 for a missing key
+
+				uint64_t fd_total{};
+				uint64_t fd_mem{};
+				if (driver == "amdgpu") {
+					fd_total = val("drm-engine-compute") + val("drm-engine-gfx");
+					fd_mem = (val("drm-memory-gtt") + val("drm-memory-vram")) * 1024; //? NOTE(review): "* 1024" here and below assumes KiB values - confirm against driver output
+				}
+				else if (driver == "i915") {
+					fd_total = val("drm-engine-render");
+					fd_mem = (val("drm-total-local0") + val("drm-total-system0")) * 1024;
+				}
+				else if (driver == "v3d") {
+					fd_total = val("drm-engine-render");
+					fd_mem = val("drm-total-memory") * 1024;
+				}
+				else if (driver == "xe") {
+					fd_total = val("drm-cycles-rcs") + val("drm-cycles-ccs"); //? NOTE(review): drm-cycles-* look like cycle counts, yet the caller divides busy_ns by elapsed nanoseconds - confirm
+					fd_mem = (val("drm-total-gtt") + val("drm-total-vram0")) * 1024;
+				}
+				else { //? Any other (or missing) driver name: generic scan of the parsed keys
+					for (const auto& [engine, usage] : engine_times) {
+						const auto cap_it = engine_capacity.find(engine);
+						const auto capacity = cap_it == engine_capacity.end() ? 1ULL : max((uint64_t)1, cap_it->second);
+						fd_total += usage / capacity; //? Scale each engine by its capacity (1 when none was reported)
+					}
+					uint64_t fd_mem_resident{};
+					uint64_t fd_mem_total{};
+					for (const auto& [key, value] : drm_values) {
+						if (key.starts_with("drm-resident-")) fd_mem_resident += value * 1024;
+						else if (key.starts_with("drm-total-")) fd_mem_total += value * 1024;
+					}
+					fd_mem = fd_mem_resident > 0 ? fd_mem_resident : fd_mem_total; //? Prefer resident figures, fall back to totals
+				}
+
+				if (driver.empty() and (fd_total > 0 or fd_mem > 0)) driver = "unknown"; //? Give entries without drm-driver a name when they still carry data
+				if (driver.empty() or (fd_total == 0 and fd_mem == 0)) continue; //? Nothing GPU related in this file
+
+				const auto gpu_id = pdev.empty() ? driver : fmt::format("{}:{}", driver, pdev);
+				// Keep the highest per-GPU values to avoid double-counting shared fdinfo snapshots.
+				if (fd_total > 0 and (not gpu_totals.contains(gpu_id) or fd_total > gpu_totals.at(gpu_id))) {
+					gpu_totals[gpu_id] = fd_total;
+				}
+
+				if (fd_mem > 0 and (not gpu_mems.contains(gpu_id) or fd_mem > gpu_mems.at(gpu_id))) {
+					gpu_mems[gpu_id] = fd_mem;
+				}
+			}
+		}
+		catch (const fs::filesystem_error&) { //? Any filesystem error discards what was gathered so far
+			return {};
+		}
+
+		gpu_proc_info total{};
+		for (const auto& [_, usage] : gpu_totals) { //? Add up the per-GPU maxima
+			total.busy_ns += usage;
+		}
+		for (const auto& [_, usage] : gpu_mems) {
+			total.mem_bytes += usage;
+		}
+		return total;
+	}
 
 	//* Get detailed info for selected process
 	static void _collect_details(const size_t pid, const uint64_t uptime, vector<proc_info>& procs) {
@@ -2918,14 +3223,17 @@ namespace Proc {
 		const auto& sorting = Config::getS("proc_sorting");
 		auto reverse = Config::getB("proc_reversed");
 		const auto& filter = Config::getS("proc_filter");
+		const bool gpu_only = Config::getB("proc_gpu_only");
 		auto per_core = Config::getB("proc_per_core");
 		auto should_filter_kernel = Config::getB("proc_filter_kernel");
 		auto tree = Config::getB("proc_tree");
 		auto show_detailed = Config::getB("show_detailed");
 		const auto pause_proc_list = Config::getB("pause_proc_list");
 		const size_t detailed_pid = Config::getI("detailed_pid");
-		bool should_filter = current_filter != filter;
+		bool should_filter = current_filter != filter or current_gpu_only != gpu_only;
+		if (gpu_only) should_filter = true;
 		if (should_filter) current_filter = filter;
+		if (should_filter) current_gpu_only = gpu_only;
 		bool sorted_change = (sorting != current_sort or reverse != current_rev or should_filter);
 		bool tree_mode_change = tree != is_tree_mode;
 		if (sorted_change) {
@@ -2943,6 +3251,13 @@ namespace Proc {
 
 		const int cmult = (per_core) ? Shared::coreCount : 1;
 		bool got_detailed = false;
+		uint64_t gpu_timestamp{};
+		uint64_t gpu_time_passed{};
+		#if defined(GPU_SUPPORT)
+			const std::unordered_map<size_t, Gpu::Nvml::proc_stat>* nvml_proc_stats = nullptr;
+		#else
+			const void* nvml_proc_stats = nullptr;
+		#endif
 
 		static size_t proc_clear_count{};
 
@@ -2954,6 +3269,16 @@ namespace Proc {
 		else {
 			should_filter = true;
 			found.clear();
+			gpu_timestamp = static_cast<uint64_t>(get_monotonicTimeUSec()) * 1000ULL;
+			if (old_gputimes > 0 and gpu_timestamp > old_gputimes)
+				gpu_time_passed = gpu_timestamp - old_gputimes;
+
+			#if defined(GPU_SUPPORT)
+				if (Gpu::Nvml::initialized) {
+					Gpu::Nvml::collect_process_stats();
+					nvml_proc_stats = &Gpu::Nvml::get_process_stats();
+				}
+			#endif
 
 			//? First make sure kernel proc cache is cleared.
 			if (should_filter_kernel and ++proc_clear_count >= 256) {
@@ -3174,6 +3499,31 @@ namespace Proc {
 				//? Process cpu usage since last update
 				new_proc.cpu_p = clamp(round(cmult * 1000 * (cpu_t - new_proc.cpu_t) / max((uint64_t)1, cputimes - old_cputimes)) / 10.0, 0.0, 100.0 * Shared::coreCount);
 
+				bool used_nvml_gpu_stats = false;
+				#if defined(GPU_SUPPORT)
+					if (nvml_proc_stats != nullptr) {
+						auto found_proc = nvml_proc_stats->find(new_proc.pid);
+						if (found_proc != nvml_proc_stats->end()) {
+							new_proc.gpu_p = clamp(found_proc->second.gpu, 0.0, 100.0);
+							new_proc.gpu_m = found_proc->second.mem;
+							new_proc.gpu_t = 0;
+							used_nvml_gpu_stats = true;
+						}
+					}
+				#endif
+
+				if (not used_nvml_gpu_stats) {
+					//? Process GPU usage since last update from /proc/[pid]/fdinfo
+					const auto gpu_stats = proc_gpu_info(d.path());
+					const uint64_t gpu_t = gpu_stats.busy_ns;
+					if (gpu_time_passed > 0 and new_proc.gpu_t > 0 and gpu_t >= new_proc.gpu_t)
+						new_proc.gpu_p = clamp(round(1000.0 * (gpu_t - new_proc.gpu_t) / gpu_time_passed) / 10.0, 0.0, 100.0);
+					else
+						new_proc.gpu_p = 0.0;
+					new_proc.gpu_t = gpu_t;
+					new_proc.gpu_m = gpu_stats.mem_bytes;
+				}
+
 				//? Process cumulative cpu usage since process start
 				new_proc.cpu_c = (double)cpu_t / max(1.0, (uptime * Shared::clkTck) - new_proc.cpu_s);
 
@@ -3202,6 +3552,8 @@ namespace Proc {
 						//? Reset cpu usage for dead processes if paused and option is set
 						if (!keep_dead_proc_usage) {
 							r.cpu_p = 0.0;
+							r.gpu_p = 0.0;
+							r.gpu_m = 0;
 							r.mem = 0;
 						}
 					}
@@ -3218,6 +3570,7 @@ namespace Proc {
 			}
 
 			old_cputimes = cputimes;
+			old_gputimes = gpu_timestamp;
 		}
 		//* ---------------------------------------------Collection done-----------------------------------------------
 
@@ -3225,8 +3578,8 @@ namespace Proc {
 		if (should_filter) {
 			filter_found = 0;
 			for (auto& p : current_procs) {
-				if (not tree and not filter.empty()) {
-					if (!matches_filter(p, filter)) {
+				if ((not tree and not filter.empty()) or gpu_only) {
+					if (not matches_filter(p, filter)) {
 						p.filtered = true;
 						filter_found++;
 					} else {
diff --git a/src/linux/intel_gpu_top/CMakeLists.txt b/src/linux/intel_gpu_top/CMakeLists.txt
index e13e19397..533a1d8ac 100644
--- a/src/linux/intel_gpu_top/CMakeLists.txt
+++ b/src/linux/intel_gpu_top/CMakeLists.txt
@@ -7,6 +7,8 @@ add_library(igt OBJECT
   intel_name_lookup_shim.c
 )
 
+target_compile_definitions(igt PRIVATE _GNU_SOURCE) # Defines _GNU_SOURCE for the igt sources only; presumably needed for GNU libc extensions they use - confirm
+
 if(BTOP_LTO)
   # We have checked LTO support already and it's supported :)
   set_target_properties(igt PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)