diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc3b1e63b..4ab545e14 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -174,6 +174,8 @@ if(KOKKOSTOOLS_HAS_VARIORUM)
   add_subdirectory(profiling/variorum-connector)
 endif()
 
+add_subdirectory(profiling/energy-profiler)  # kernel timer + power/energy tools
+
 # GPU profilers
 if(Kokkos_ENABLE_CUDA)
   add_subdirectory(profiling/nvtx-connector)
diff --git a/profiling/energy-profiler/CMakeLists.txt b/profiling/energy-profiler/CMakeLists.txt
new file mode 100644
index 000000000..535cd8e9a
--- /dev/null
+++ b/profiling/energy-profiler/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(tests)   # NOTE(review): tests/ is configured unconditionally
+add_subdirectory(kokkos)  # Kokkos tool libraries (kp_energy_kernel_timer, ...)
\ No newline at end of file
diff --git a/profiling/energy-profiler/common/daemon.cpp b/profiling/energy-profiler/common/daemon.cpp
new file mode 100644
index 000000000..b4487eb75
--- /dev/null
+++ b/profiling/energy-profiler/common/daemon.cpp
@@ -0,0 +1,43 @@
+#include "daemon.hpp"
+#include <stdexcept>
+#include <thread>
+
+// Launches the worker thread that runs tick(); throws if already running.
+void Daemon::start() {
+  if (running_) {
+    throw std::runtime_error("Daemon already started");
+  }
+  running_ = true;  // set first so the worker's loop test sees it
+  thread_  = std::thread(&Daemon::tick, this);
+}
+
+// Worker loop: invokes func_ once per interval_ until running_ is cleared.
+void Daemon::tick() {
+  // steady_clock is monotonic. high_resolution_clock may alias the system
+  // clock, whose adjustments would corrupt the measured execution time.
+  using Clock = std::chrono::steady_clock;
+
+  while (running_) {
+    const Clock::time_point start_time = Clock::now();
+
+    // Execute the function
+    func_();
+
+    // Sleep only for what is left of the interval. The elapsed time is kept
+    // at the clock's native resolution: truncating it to whole milliseconds
+    // made every period up to 1 ms too long.
+    const Clock::duration elapsed = Clock::now() - start_time;
+    if (elapsed < interval_) {
+      std::this_thread::sleep_for(interval_ - elapsed);
+    }
+  }
+}
+
+// Signals the worker loop to exit and blocks until the thread has joined.
+void Daemon::stop() {
+  if (!running_) {
+    throw std::runtime_error("Daemon not started");
+  }
+  running_ = false;  // observed by tick() at its next loop test
+  thread_.join();
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/common/daemon.hpp b/profiling/energy-profiler/common/daemon.hpp
new file mode 100644
index 000000000..ad9188452
--- /dev/null
+++ b/profiling/energy-profiler/common/daemon.hpp
@@ -0,0 +1,23 @@
+#pragma once
+#include <atomic>
+#include <chrono>
+#include <functional>
+#include <thread>
+
+// Runs `func` repeatedly on a background thread, once per `interval_ms`.
+class Daemon {
+ public:
+  Daemon(std::function<void()> func, int interval_ms)
+      : interval_(interval_ms), func_(func) {}
+  void start();  // spawns the worker; throws std::runtime_error if running
+  void tick();   // worker loop body; runs until running_ becomes false
+  void stop();   // clears running_ and joins; throws if not running
+  bool is_running() const { return running_; }
+  std::thread& get_thread() { return thread_; }
+
+ private:
+  std::chrono::milliseconds interval_;
+  std::atomic<bool> running_{false};  // shared with worker thread: atomic
+  std::function<void()> func_;
+  std::thread thread_;
+};
\ No newline at end of file
diff --git a/profiling/energy-profiler/common/filename_prefix.cpp b/profiling/energy-profiler/common/filename_prefix.cpp
new file mode 100644
index 000000000..294acb82c
--- /dev/null
+++ b/profiling/energy-profiler/common/filename_prefix.cpp
@@ -0,0 +1,8 @@
+#include "filename_prefix.hpp"
+
+// Returns "<hostname>-<pid>"; the host part is "unknown" if lookup fails.
+std::string generate_prefix() {
+  char host[256] = {};  // zero-filled, and the last byte is never written
+  const bool ok = gethostname(host, sizeof(host) - 1) == 0;
+  return std::string(ok ? host : "unknown") + "-" + std::to_string(getpid());
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/common/filename_prefix.hpp b/profiling/energy-profiler/common/filename_prefix.hpp
new file mode 100644
index 000000000..93b02371e
--- /dev/null
+++ b/profiling/energy-profiler/common/filename_prefix.hpp
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <string>
+#include <unistd.h>
+
+std::string generate_prefix();  // "<hostname>-<pid>", used to name output files
\ No newline at end of file
diff --git a/profiling/energy-profiler/common/timer.cpp b/profiling/energy-profiler/common/timer.cpp
new file mode 100644
index 000000000..0b9401440
--- /dev/null
+++ b/profiling/energy-profiler/common/timer.cpp
@@ -0,0 +1,244 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include "timer.hpp"
+#include <iostream>
+#include <deque>
+#include <cstdio>
+
+// EnergyTiming implementations
+// Default timing: id 0, empty name, unknown region type, started right now.
+EnergyTiming::EnergyTiming()
+    : timing_id_(0), name_(""), region_type_(RegionType::Unknown),
+      start_time_(std::chrono::high_resolution_clock::now()) {}
+
+// Starts a timing for region `name` of kind `type`, identified by timing_id.
+EnergyTiming::EnergyTiming(uint64_t timing_id, RegionType type,
+                           std::string name)
+    : timing_id_(timing_id), name_(name), region_type_(type),
+      start_time_(std::chrono::high_resolution_clock::now()) {}
+
+void EnergyTiming::end() {  // stamps end_time_ with the current clock value
+  end_time_ = std::chrono::high_resolution_clock::now();
+}
+
+bool EnergyTiming::is_ended() const {  // true once end_time_ is non-default
+  using TimePoint = std::chrono::high_resolution_clock::time_point;
+  return !(end_time_ == TimePoint{});
+}
+
+uint64_t EnergyTiming::get_duration_ms() const {  // whole ms; 0 if not ended
+  if (!is_ended()) return 0;  // epoch - start is negative and would wrap
+  const auto elapsed = end_time_ - start_time_;
+  return static_cast<uint64_t>(elapsed / std::chrono::milliseconds(1));
+}
+
+// EnergyTimer implementations
+void EnergyTimer::start_timing(uint64_t timing_id, RegionType type,
+                               std::string name) {  // no-op if the id exists
+  timings_.insert({timing_id, EnergyTiming(timing_id, type, name)});
+}
+
+// Ends the timing stored under timing_id; unknown ids are ignored.
+void EnergyTimer::end_timing(uint64_t timing_id) {
+  const auto found = timings_.find(timing_id);
+  if (found == timings_.end()) return;
+  found->second.end();
+}
+
+std::unordered_map<uint64_t, EnergyTiming>& EnergyTimer::get_timings() {
+  return timings_;  // mutable reference to the internal id -> timing map
+}
+
+namespace KokkosTools {
+namespace Timer {
+
+// Writes one CSV row per kernel timing to `filename`. Nothing is written
+// when `timings` is empty; an error is logged if the file cannot be opened.
+void export_kernels_csv(const std::deque<TimingInfo>& timings,
+                        const std::string& filename) {
+  if (timings.empty()) return;
+
+  FILE* file = fopen(filename.c_str(), "w");
+  if (!file) {
+    std::cerr << "ERROR: Unable to open file " << filename << " for writing.\n";
+    return;
+  }
+  // Whole ms since the clock epoch, as long long. The chrono rep is long on
+  // some platforms and long long on others, so printing it with %ld was UB.
+  const auto epoch_ms = [](std::chrono::high_resolution_clock::time_point t) {
+    return static_cast<long long>(t.time_since_epoch() /
+                                  std::chrono::milliseconds(1));
+  };
+  fprintf(file,
+          "name,type,start_time_epoch_ms,end_time_epoch_ms,duration_ms\n");
+  for (const auto& timing : timings) {
+    const char* type = "unknown";
+    switch (timing.type) {
+      case RegionType::ParallelFor: type = "parallel_for"; break;
+      case RegionType::ParallelScan: type = "parallel_scan"; break;
+      case RegionType::ParallelReduce: type = "parallel_reduce"; break;
+      default: break;
+    }
+    fprintf(file, "%s,%s,%lld,%lld,%lld\n", timing.name.c_str(), type,
+            epoch_ms(timing.start_time), epoch_ms(timing.end_time),
+            static_cast<long long>(timing.duration.count() / 1000000));
+  }
+  fclose(file);
+  std::cout << "Timing data exported to " << filename << std::endl;
+}
+
+// Writes one CSV row per region timing to `filename` (skipped when empty).
+void export_regions_csv(const std::deque<TimingInfo>& timings,
+                        const std::string& filename) {
+  if (timings.empty()) return;
+
+  FILE* file = fopen(filename.c_str(), "w");
+  if (!file) {
+    std::cerr << "ERROR: Unable to open file " << filename << " for writing.\n";
+    return;
+  }
+  // Whole ms since the clock epoch, as long long. The chrono rep is long on
+  // some platforms and long long on others, so printing it with %ld was UB.
+  const auto epoch_ms = [](std::chrono::high_resolution_clock::time_point t) {
+    return static_cast<long long>(t.time_since_epoch() /
+                                  std::chrono::milliseconds(1));
+  };
+  fprintf(file, "name,start_time_epoch_ms,end_time_epoch_ms,duration_ms\n");
+  for (const auto& timing : timings) {
+    fprintf(file, "%s,%lld,%lld,%lld\n", timing.name.c_str(),
+            epoch_ms(timing.start_time), epoch_ms(timing.end_time),
+            static_cast<long long>(timing.duration.count() / 1000000));
+  }
+  fclose(file);
+  std::cout << "Region data exported to " << filename << std::endl;
+}
+
+// Writes one CSV row per deep-copy timing to `filename` (skipped when empty).
+void export_deepcopies_csv(const std::deque<TimingInfo>& timings,
+                           const std::string& filename) {
+  if (timings.empty()) return;
+
+  FILE* file = fopen(filename.c_str(), "w");
+  if (!file) {
+    std::cerr << "ERROR: Unable to open file " << filename << " for writing.\n";
+    return;
+  }
+  // Whole ms since the clock epoch, as long long. The chrono rep is long on
+  // some platforms and long long on others, so printing it with %ld was UB.
+  const auto epoch_ms = [](std::chrono::high_resolution_clock::time_point t) {
+    return static_cast<long long>(t.time_since_epoch() /
+                                  std::chrono::milliseconds(1));
+  };
+  fprintf(file, "name,start_time_epoch_ms,end_time_epoch_ms,duration_ms\n");
+  for (const auto& timing : timings) {
+    fprintf(file, "%s,%lld,%lld,%lld\n", timing.name.c_str(),
+            epoch_ms(timing.start_time), epoch_ms(timing.end_time),
+            static_cast<long long>(timing.duration.count() / 1000000));
+  }
+  fclose(file);
+  std::cout << "Deep copy data exported to " << filename << std::endl;
+}
+
+// Prints the kernel timings to stdout as a fixed-width table.
+void print_kernels_summary(const std::deque<TimingInfo>& kernels) {
+  // Left-justifies `text` in `width` columns; wider text is printed unpadded.
+  // (The old unguarded `width - size()` wrapped around as size_t and threw.)
+  const auto cell = [](const std::string& text, size_t width) {
+    const size_t fill = text.size() < width ? width - text.size() : 0;
+    return text + std::string(fill, ' ');
+  };
+  // Whole milliseconds since the clock epoch, as decimal text.
+  const auto epoch_ms = [](std::chrono::high_resolution_clock::time_point t) {
+    return std::to_string(t.time_since_epoch() / std::chrono::milliseconds(1));
+  };
+  std::cout << "\n==== KERNELS ====\n";
+  std::cout << "| Name                                 | Type           | "
+               "Start(ms)         | End(ms)           | Duration (ms) |\n";
+  std::cout << "|--------------------------------------|----------------|------"
+               "-------------|-------------------|---------------|\n";
+  for (const auto& info : kernels) {
+    const char* type = "unknown";
+    switch (info.type) {
+      case RegionType::ParallelFor: type = "parallel_for"; break;
+      case RegionType::ParallelScan: type = "parallel_scan"; break;
+      case RegionType::ParallelReduce: type = "parallel_reduce"; break;
+      default: break;
+    }
+    std::cout << "| " << cell(info.name, 38) << "| " << cell(type, 16) << "| "
+              << cell(epoch_ms(info.start_time), 19) << "| "
+              << cell(epoch_ms(info.end_time), 19) << "| "
+              << cell(std::to_string(info.duration.count() / 1000000), 13)
+              << "|\n";
+  }
+}
+
+// Prints the region timings to stdout as a fixed-width table.
+void print_regions_summary(const std::deque<TimingInfo>& regions) {
+  // Left-justifies `text` in `width` columns; wider text is printed unpadded.
+  // (The old unguarded `width - size()` wrapped around as size_t and threw.)
+  const auto cell = [](const std::string& text, size_t width) {
+    const size_t fill = text.size() < width ? width - text.size() : 0;
+    return text + std::string(fill, ' ');
+  };
+  // Whole milliseconds since the clock epoch, as decimal text.
+  const auto epoch_ms = [](std::chrono::high_resolution_clock::time_point t) {
+    return std::to_string(t.time_since_epoch() / std::chrono::milliseconds(1));
+  };
+
+  std::cout << "\n==== REGIONS ====\n";
+  std::cout << "| Name                                 | Start(ms)         | "
+               "End(ms)           | Duration (ms) |\n";
+  std::cout << "|--------------------------------------|-------------------|---"
+               "----------------|---------------|\n";
+  for (const auto& info : regions) {
+    std::cout << "| " << cell(info.name, 38) << "| "
+              << cell(epoch_ms(info.start_time), 19) << "| "
+              << cell(epoch_ms(info.end_time), 19) << "| "
+              << cell(std::to_string(info.duration.count() / 1000000), 13)
+              << "|\n";
+  }
+}
+
+// Prints the deep-copy timings to stdout as a fixed-width table.
+void print_deepcopies_summary(const std::deque<TimingInfo>& deepcopies) {
+  // Left-justifies `text` in `width` columns; wider text is printed unpadded.
+  // (The old unguarded `width - size()` wrapped around as size_t and threw.)
+  const auto cell = [](const std::string& text, size_t width) {
+    const size_t fill = text.size() < width ? width - text.size() : 0;
+    return text + std::string(fill, ' ');
+  };
+  // Whole milliseconds since the clock epoch, as decimal text.
+  const auto epoch_ms = [](std::chrono::high_resolution_clock::time_point t) {
+    return std::to_string(t.time_since_epoch() / std::chrono::milliseconds(1));
+  };
+
+  std::cout << "\n==== DEEP COPIES ====\n";
+  std::cout << "| Name                                 | Start(ms)         | "
+               "End(ms)           | Duration (ms) |\n";
+  std::cout << "|--------------------------------------|-------------------|---"
+               "----------------|---------------|\n";
+  for (const auto& info : deepcopies) {
+    std::cout << "| " << cell(info.name, 38) << "| "
+              << cell(epoch_ms(info.start_time), 19) << "| "
+              << cell(epoch_ms(info.end_time), 19) << "| "
+              << cell(std::to_string(info.duration.count() / 1000000), 13)
+              << "|\n";
+  }
+}
+
+}  // namespace Timer
+}  // namespace KokkosTools
diff --git a/profiling/energy-profiler/common/timer.hpp b/profiling/energy-profiler/common/timer.hpp
new file mode 100644
index 000000000..bb4ff8ac3
--- /dev/null
+++ b/profiling/energy-profiler/common/timer.hpp
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <deque>
+
+enum class RegionType {  // kind of profiled region a timing belongs to
+  Unknown,         // used by a default-constructed EnergyTiming
+  ParallelFor,     // reported as "parallel_for"
+  ParallelReduce,  // reported as "parallel_reduce"
+  ParallelScan,    // reported as "parallel_scan"
+  DeepCopy,        // no own label in the kernel table/CSV ("unknown")
+  UserRegion       // no own label in the kernel table/CSV ("unknown")
+};
+
+struct TimingInfo {  // one timing record, as consumed by the export/print code
+  std::string name;
+  RegionType type = RegionType::Unknown;  // defined value when defaulted
+  std::chrono::high_resolution_clock::time_point start_time;
+  std::chrono::high_resolution_clock::time_point end_time;
+  std::chrono::nanoseconds duration{0};  // zero, not indeterminate, by default
+  uint64_t id = 0;
+};
+
+struct EnergyTiming {
+  // Starts a timing with id 0, an empty name and RegionType::Unknown.
+  EnergyTiming();
+  // Starts a timing for the named region; start_time_ is taken immediately.
+  EnergyTiming(uint64_t timing_id, RegionType type, std::string name);
+  // Stamps end_time_ with the current clock value.
+  void end();
+  // True once end_time_ differs from a default-constructed time point.
+  bool is_ended() const;
+  // Whole milliseconds between start_time_ and end_time_.
+  uint64_t get_duration_ms() const;
+
+  uint64_t timing_id_;
+  std::string name_;
+  RegionType region_type_;
+  std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;
+  std::chrono::time_point<std::chrono::high_resolution_clock> end_time_;
+};
+
+struct EnergyTimer {  // id-keyed collection of EnergyTiming records
+ public:
+  void start_timing(uint64_t timing_id, RegionType type, std::string name);
+  void end_timing(uint64_t timing_id);  // no-op when timing_id is unknown
+  std::unordered_map<uint64_t, EnergyTiming>& get_timings();
+
+ private:
+  std::unordered_map<uint64_t, EnergyTiming> timings_;  // keyed by timing id
+};
+
+// CSV export and stdout summary helpers for TimingInfo records
+namespace KokkosTools {
+namespace Timer {
+void export_kernels_csv(const std::deque<TimingInfo>& timings,
+                        const std::string& filename);
+void export_regions_csv(const std::deque<TimingInfo>& timings,
+                        const std::string& filename);
+void export_deepcopies_csv(const std::deque<TimingInfo>& timings,
+                           const std::string& filename);
+void print_kernels_summary(const std::deque<TimingInfo>& kernels);
+void print_regions_summary(const std::deque<TimingInfo>& regions);
+void print_deepcopies_summary(const std::deque<TimingInfo>& deepcopies);
+}  // namespace Timer
+}  // namespace KokkosTools
\ No newline at end of file
diff --git a/profiling/energy-profiler/common/tool_interface.hpp b/profiling/energy-profiler/common/tool_interface.hpp
new file mode 100644
index 000000000..2aef08b5f
--- /dev/null
+++ b/profiling/energy-profiler/common/tool_interface.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <cstdint>
+#include "kp_core.hpp"
+
+class ToolInterface {  // abstract profiling hooks (init, kernels, copies, ...)
+ public:  // NOTE(review): only begin_parallel_for takes kID by value - confirm
+  ToolInterface()          = default;
+  virtual ~ToolInterface() = default;  // virtual: safe to destroy via base
+  virtual void init_library(const int loadSeq, const uint64_t interfaceVer,
+                            const uint32_t devInfoCount,
+                            Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) = 0;
+  virtual void finalize_library()                                           = 0;
+  virtual void begin_parallel_for(const char* name, const uint32_t devID,
+                                  uint64_t kID)                             = 0;
+  virtual void end_parallel_for(uint64_t kID)                               = 0;
+  virtual void begin_parallel_scan(const char* name, const uint32_t devID,
+                                   uint64_t* kID)                           = 0;
+  virtual void end_parallel_scan(uint64_t kID)                              = 0;
+  virtual void begin_parallel_reduce(const char* name, const uint32_t devID,
+                                     uint64_t* kID)                         = 0;
+  virtual void end_parallel_reduce(uint64_t kID)                            = 0;
+  virtual void begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                               const char* dst_name, const void* dst_ptr,
+                               Kokkos::Tools::SpaceHandle src_handle,
+                               const char* src_name, const void* src_ptr,
+                               uint64_t size)                               = 0;
+  virtual void end_deep_copy()                                              = 0;
+  virtual void push_profile_region(const char* region_name)                 = 0;
+  virtual void pop_profile_region()                                         = 0;
+};
\ No newline at end of file
diff --git a/profiling/energy-profiler/kokkos/CMakeLists.txt b/profiling/energy-profiler/kokkos/CMakeLists.txt
new file mode 100644
index 000000000..087194a56
--- /dev/null
+++ b/profiling/energy-profiler/kokkos/CMakeLists.txt
@@ -0,0 +1,81 @@
+# Threads::Threads is linked by the tools below that build daemon.cpp (std::thread)
+find_package(Threads REQUIRED)
+
+kp_add_library(kp_energy_kernel_timer kp_energy_kernel_timer.cpp  # timing only
+    ../common/timer.cpp
+    ../common/filename_prefix.cpp
+    ../tools/kernel_timer_tool.cpp
+)
+
+target_include_directories(kp_energy_kernel_timer PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+if(KOKKOSTOOLS_HAS_VARIORUM)
+    # Variorum-backed tool; it builds daemon.cpp, which uses std::thread.
+    kp_add_library(kp_variorum_power kp_variorum_power.cpp
+        ../common/daemon.cpp
+        ../common/filename_prefix.cpp
+        ../common/timer.cpp
+        ../provider/provider_variorum.cpp
+        ../tools/kernel_timer_tool.cpp
+    )
+
+    # Threads::Threads comes from the find_package(Threads) call at the top.
+    target_link_libraries(kp_variorum_power PRIVATE variorum::variorum Threads::Threads)
+endif()
+
+find_package(CUDAToolkit QUIET)
+
+if (CUDAToolkit_FOUND)
+    # CUDA::nvml is an imported target defined by FindCUDAToolkit, not a package.
+    if(TARGET CUDA::nvml)
+        message(STATUS "Found CUDA NVML, making NVML power profiler available.")
+    else()
+        message(STATUS "CUDA::nvml target not found, skipping NVML power profiler.")
+        return()
+    endif()
+else()
+    message(STATUS "CUDAToolkit not found, skipping NVML power profiler.")
+    return()
+endif()
+
+kp_add_library(kp_nvml_power kp_nvml_power.cpp  # NVML tool built with the Daemon
+    ../common/daemon.cpp
+    ../common/filename_prefix.cpp
+    ../common/timer.cpp
+    ../provider/provider_nvml.cpp
+    ../tools/kernel_timer_tool.cpp
+)
+target_link_libraries(kp_nvml_power PRIVATE CUDA::nvml Threads::Threads)
+
+target_include_directories(kp_nvml_power PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# NVML Direct Power Tool (same shared sources and link set as kp_nvml_power)
+kp_add_library(kp_nvml_direct_power kp_nvml_direct_power.cpp
+    ../common/daemon.cpp
+    ../common/filename_prefix.cpp
+    ../common/timer.cpp
+    ../provider/provider_nvml.cpp
+    ../tools/kernel_timer_tool.cpp
+)
+target_link_libraries(kp_nvml_direct_power PRIVATE CUDA::nvml Threads::Threads)
+
+target_include_directories(kp_nvml_direct_power PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# NVML Energy Consumption Tool (no daemon.cpp, hence no Threads::Threads)
+kp_add_library(kp_nvml_energy_consumption kp_nvml_energy_consumption.cpp
+    ../common/filename_prefix.cpp
+    ../common/timer.cpp
+    ../provider/provider_nvml.cpp
+    ../tools/kernel_timer_tool.cpp
+)
+target_link_libraries(kp_nvml_energy_consumption PRIVATE CUDA::nvml)
+
+target_include_directories(kp_nvml_energy_consumption PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
\ No newline at end of file
diff --git a/profiling/energy-profiler/kokkos/kp_energy_kernel_timer.cpp b/profiling/energy-profiler/kokkos/kp_energy_kernel_timer.cpp
new file mode 100644
index 000000000..8be118d20
--- /dev/null
+++ b/profiling/energy-profiler/kokkos/kp_energy_kernel_timer.cpp
@@ -0,0 +1,200 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/**
+ * Kokkos Kernel Timer for the Energy Profiler
+ * Records kernel, region and deep-copy timings through KernelTimerTool, then
+ * prints them and exports them as CSV files when the library is finalized.
+ */
+
+#include <cstring>
+#include <iostream>
+
+#include "kp_core.hpp"
+#include "../tools/kernel_timer_tool.hpp"
+#include "../common/filename_prefix.hpp"
+#include "../common/timer.hpp"
+
+namespace KokkosTools {
+namespace KernelTimer {
+
+// --- Core Initialization ---
+KernelTimerTool timer;  // single tool instance every callback forwards to
+
+bool VERBOSE = false;  // gates the per-event trace output; never set here
+std::string KOKKOS_PROFILE_LIBRARY_NAME =
+    "Kokkos Kernel Timer for Energy Profiler";
+
+// --- Library Initialization/Finalization ---
+// Forwards the init callback's arguments unchanged to the timer tool.
+void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
+                          const uint32_t devInfoCount,
+                          Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
+  timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
+}
+
+void kokkosp_finalize_library() {
+  std::cout << "Kokkos Power Profiler: Finalizing library\n";
+  timer.finalize_library();
+  std::cout << "Kokkos Power Profiler: Library finalized\n";
+
+  std::string prefix = generate_prefix();  // "<hostname>-<pid>" file prefix
+  // Kernel summary: print the table, then write <prefix>_kernels.csv
+  const auto& kernels = timer.get_kernel_timings();
+  KokkosTools::Timer::print_kernels_summary(kernels);
+  KokkosTools::Timer::export_kernels_csv(kernels, prefix + "_kernels.csv");
+
+  // Region summary
+  const auto& regions = timer.get_region_timings();
+  KokkosTools::Timer::print_regions_summary(regions);
+  KokkosTools::Timer::export_regions_csv(regions, prefix + "_regions.csv");
+
+  // Deep-copy summary
+  const auto& deepcopies = timer.get_deep_copy_timings();
+  KokkosTools::Timer::print_deepcopies_summary(deepcopies);
+  KokkosTools::Timer::export_deepcopies_csv(deepcopies,
+                                            prefix + "_deepcopies.csv");
+}
+
+// --- Kernels Launch/End ---
+
+// NOTE(review): forwards *kID by value while scan/reduce forward the pointer.
+void kokkosp_begin_parallel_for(const char* name, const uint32_t devID,
+                                uint64_t* kID) {
+  timer.begin_parallel_for(name, devID, *kID);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Started parallel_for '" << name
+            << "' on device " << devID << " with ID " << *kID << "\n";
+}
+
+// Forwards the end of a parallel_for to the timer tool; traces it if VERBOSE.
+void kokkosp_end_parallel_for(const uint64_t kID) {
+  timer.end_parallel_for(kID);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Ended parallel_for with ID " << kID
+            << "\n";
+}
+
+// Forwards a parallel_scan launch to the timer tool; traces it if VERBOSE.
+void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID,
+                                 uint64_t* kID) {
+  timer.begin_parallel_scan(name, devID, kID);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Started parallel_scan '" << name
+            << "' on device " << devID << " with ID " << *kID << "\n";
+}
+
+// Forwards the end of a parallel_scan to the timer tool; traces it if VERBOSE.
+void kokkosp_end_parallel_scan(const uint64_t kID) {
+  timer.end_parallel_scan(kID);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Ended parallel_scan with ID " << kID
+            << "\n";
+}
+
+// Forwards a parallel_reduce launch to the timer tool; traces it if VERBOSE.
+void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID,
+                                   uint64_t* kID) {
+  timer.begin_parallel_reduce(name, devID, kID);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Started parallel_reduce '" << name
+            << "' on device " << devID << " with ID " << *kID << "\n";
+}
+
+// Forwards the end of a parallel_reduce to the timer tool; traces if VERBOSE.
+void kokkosp_end_parallel_reduce(const uint64_t kID) {
+  timer.end_parallel_reduce(kID);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Ended parallel_reduce with ID " << kID
+            << "\n";
+}
+
+// Forwards a region push to the timer tool; traces the name if VERBOSE.
+void kokkosp_push_profile_region(char const* regionName) {
+  timer.push_profile_region(regionName);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Pushed profile region '" << regionName
+            << "'\n";
+}
+
+// Forwards a region pop to the timer tool; traces it if VERBOSE.
+void kokkosp_pop_profile_region() {
+  timer.pop_profile_region();
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Popped profile region\n";
+}
+
+// Forwards a deep-copy start to the timer tool; traces names/size if VERBOSE.
+void kokkosp_begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                             const char* dst_name, const void* dst_ptr,
+                             Kokkos::Tools::SpaceHandle src_handle,
+                             const char* src_name, const void* src_ptr,
+                             uint64_t size) {
+  timer.begin_deep_copy(dst_handle, dst_name, dst_ptr, src_handle, src_name,
+                        src_ptr, size);
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Started deep copy from '" << src_name
+            << "' to '" << dst_name << "' of size " << size << " bytes\n";
+}
+
+// Forwards a deep-copy end to the timer tool; traces it if VERBOSE.
+void kokkosp_end_deep_copy() {
+  timer.end_deep_copy();
+  if (!VERBOSE) return;
+  std::cout << "Kokkos Power Profiler: Ended deep copy\n";
+}
+
+// --- Event Set Configuration ---
+
+Kokkos::Tools::Experimental::EventSet get_event_set() {  // callback table
+  Kokkos::Tools::Experimental::EventSet my_event_set;
+  memset(&my_event_set, 0,
+         sizeof(my_event_set));  // zero any pointers not set here
+  my_event_set.init                  = kokkosp_init_library;
+  my_event_set.finalize              = kokkosp_finalize_library;
+  my_event_set.begin_deep_copy       = kokkosp_begin_deep_copy;
+  my_event_set.end_deep_copy         = kokkosp_end_deep_copy;
+  my_event_set.begin_parallel_for    = kokkosp_begin_parallel_for;
+  my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce;
+  my_event_set.begin_parallel_scan   = kokkosp_begin_parallel_scan;
+  my_event_set.end_parallel_for      = kokkosp_end_parallel_for;
+  my_event_set.end_parallel_reduce   = kokkosp_end_parallel_reduce;
+  my_event_set.end_parallel_scan     = kokkosp_end_parallel_scan;
+  my_event_set.push_region           = kokkosp_push_profile_region;
+  my_event_set.pop_region            = kokkosp_pop_profile_region;
+  return my_event_set;  // returned by value; unset callbacks stay zeroed
+}
+
+}  // namespace KernelTimer
+}  // namespace KokkosTools
+
+extern "C" {
+
+namespace impl = KokkosTools::KernelTimer;
+// EXPOSE_* (kp_core.hpp) presumably define the C kokkosp_* entry points.
+EXPOSE_INIT(impl::kokkosp_init_library)
+EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
+EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
+EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for)
+EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
+EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
+EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
+EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
+EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region)
+EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
+EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy)
+EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy)
+}
diff --git a/profiling/energy-profiler/kokkos/kp_nvml_direct_power.cpp b/profiling/energy-profiler/kokkos/kp_nvml_direct_power.cpp
new file mode 100644
index 000000000..a2160a3f3
--- /dev/null
+++ b/profiling/energy-profiler/kokkos/kp_nvml_direct_power.cpp
@@ -0,0 +1,356 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos Direct Power Profiler
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/**
+ * @file kp_nvml_direct_power.cpp
+ * @brief Kokkos Direct Power Profiler Tool using NVML.
+ *
+ * This tool leverages a background daemon to periodically sample GPU power
+ * consumption using the NVML library's direct power measurement API. It starts
+ * monitoring when the Kokkos library is initialized and prints a detailed power
+ * profile upon finalization.
+ */
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <mutex>
+#include <iomanip>
+#include <cmath>
+#include <fstream>
+#include <memory>
+
+#include "kp_core.hpp"
+#include "../common/daemon.hpp"
+#include "../provider/provider_nvml.hpp"
+#include "../common/filename_prefix.hpp"
+#include "../common/timer.hpp"
+#include "../tools/kernel_timer_tool.hpp"
+
+namespace KokkosTools {
+namespace DirectPower {
+
+// --- Configuration ---
+// Sampling period in milliseconds; handed to the Daemon in kokkosp_init_library.
+constexpr int SAMPLING_INTERVAL_MS = 20;
+
+// --- Global State for the Profiler ---
+static std::unique_ptr<Daemon> g_power_daemon;  // created in init, stopped in finalize
+static std::unique_ptr<NVMLProvider> g_nvml_provider;  // reset to null when initialize() fails
+
+// Timer tool that every profiling hook in this file forwards to.
+static KernelTimerTool g_timer;
+
+// One sampling tick: a timestamp plus one power reading per device.
+struct DirectPowerSample {
+  std::chrono::high_resolution_clock::time_point timestamp;
+  std::vector<double> device_powers_watts;  // indexed by device; negative values are skipped as invalid
+};
+
+// Samples appended by power_monitoring_tick while holding g_samples_mutex.
+static std::vector<DirectPowerSample> g_power_samples;
+static std::mutex g_samples_mutex;
+static std::chrono::high_resolution_clock::time_point g_start_time;  // set just before the daemon starts
+static size_t g_device_count = 0;  // from get_device_count(); stays 0 if provider init fails
+
+/**
+ * @brief Daemon callback: records one power reading per device.
+ *
+ * Does nothing when the NVML provider is missing or not initialized.
+ * Otherwise it timestamps a new sample, queries every device in index
+ * order and appends the sample to g_power_samples under g_samples_mutex.
+ */
+void power_monitoring_tick() {
+  const bool provider_ready =
+      g_nvml_provider && g_nvml_provider->is_initialized();
+  if (!provider_ready) return;
+
+  // Timestamp first, then query the devices in index order.
+  DirectPowerSample reading;
+  reading.timestamp = std::chrono::high_resolution_clock::now();
+  reading.device_powers_watts.reserve(g_device_count);
+  for (size_t dev = 0; dev < g_device_count; ++dev) {
+    reading.device_powers_watts.push_back(
+        g_nvml_provider->get_device_power_usage_direct(dev));
+  }
+
+  // Only the append to the shared vector happens under the lock.
+  std::lock_guard<std::mutex> guard(g_samples_mutex);
+  g_power_samples.push_back(std::move(reading));
+}
+
+/**
+ * @brief Computes min/avg/max power and integrated energy for one device.
+ * All outputs are 0 when there are no samples, device_index is out of range,
+ * or the device has no valid (non-negative) reading.
+ * @param samples Power samples, processed in vector order.
+ * @param device_index Index into each sample's device_powers_watts.
+ * @param[out] avg_power Mean of the valid readings.
+ * @param[out] min_power,max_power Smallest / largest valid reading.
+ * @param[out] total_energy Sum of (previous reading x elapsed seconds).
+ */
+void analyze_device_power_data(const std::vector<DirectPowerSample>& samples,
+                               size_t device_index, double& avg_power,
+                               double& min_power, double& max_power,
+                               double& total_energy) {
+  if (samples.empty() || device_index >= g_device_count) {
+    avg_power = min_power = max_power = total_energy = 0.0;
+    return;
+  }
+
+  // Skip leading samples that lack an entry for this device or hold a negative reading.
+  size_t first_valid = 0;
+  while (first_valid < samples.size() &&
+         (device_index >= samples[first_valid].device_powers_watts.size() ||
+          samples[first_valid].device_powers_watts[device_index] < 0)) {
+    first_valid++;
+  }
+
+  if (first_valid >= samples.size()) {  // no usable reading at all
+    avg_power = min_power = max_power = total_energy = 0.0;
+    return;
+  }
+
+  min_power            = samples[first_valid].device_powers_watts[device_index];
+  max_power            = samples[first_valid].device_powers_watts[device_index];
+  double power_sum     = 0.0;
+  size_t valid_samples = 0;
+  total_energy         = 0.0;
+
+  for (size_t i = first_valid; i < samples.size(); ++i) {
+    if (device_index >= samples[i].device_powers_watts.size()) continue;
+
+    const double power = samples[i].device_powers_watts[device_index];
+    if (power < 0) continue;  // invalid reading: excluded from stats and from the energy sum
+
+    power_sum += power;
+    valid_samples++;
+    if (power < min_power) min_power = power;
+    if (power > max_power) max_power = power;
+
+    // Energy = Power * Time, using the interval that ends at this sample.
+    if (i > first_valid) {
+      double time_delta_s = std::chrono::duration<double>(
+                                samples[i].timestamp - samples[i - 1].timestamp)
+                                .count();
+      // The interval counts only if the immediately preceding sample is valid too.
+      if (device_index < samples[i - 1].device_powers_watts.size() &&
+          samples[i - 1].device_powers_watts[device_index] >= 0) {
+        total_energy +=
+            samples[i - 1].device_powers_watts[device_index] * time_delta_s;
+      }
+    }
+  }
+
+  avg_power = valid_samples > 0 ? power_sum / valid_samples : 0.0;
+}
+
+void export_direct_power_data_csv(const std::string& filename) {  // writes g_power_samples as CSV
+  std::ofstream file(filename);
+  if (!file.is_open()) {
+    std::cerr << "ERROR: Unable to open file " << filename << " for writing.\n";
+    return;
+  }
+
+  // Header row: "timestamp" followed by one power column per device.
+  file << "timestamp";
+  for (size_t i = 0; i < g_device_count; ++i) {
+    file << ",device_" << i << "_power_watts";
+  }
+  file << "\n";
+
+  // One row per sample; note g_power_samples is read here without taking g_samples_mutex.
+  for (const auto& sample : g_power_samples) {
+    auto timestamp = std::chrono::duration_cast<std::chrono::milliseconds>(
+                         sample.timestamp.time_since_epoch())
+                         .count();  // milliseconds since the clock's epoch
+    file << timestamp;
+    for (size_t i = 0; i < g_device_count; ++i) {
+      if (i < sample.device_powers_watts.size()) {
+        file << "," << sample.device_powers_watts[i];
+      } else {
+        file << ",-1";  // sample has no entry for this device
+      }
+    }
+    file << "\n";
+  }
+  file.close();
+  std::cout << "Direct power data exported to " << filename << std::endl;
+}
+
+// --- Kokkos Profiling Hooks ---
+
+void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
+                          const uint32_t devInfoCount,
+                          Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
+  std::cout << "Kokkos Direct Power Profiler: Initializing...\n";
+  std::cout << "Sampling Interval: " << SAMPLING_INTERVAL_MS << " ms\n";
+
+  // The timer tool is initialized first, so timing is set up even if NVML fails below.
+  g_timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
+
+  g_nvml_provider = std::make_unique<NVMLProvider>();
+  if (!g_nvml_provider->initialize()) {
+    std::cerr << "ERROR: Failed to initialize NVML provider. Direct power "
+                 "profiling disabled.\n";
+    g_nvml_provider.reset();  // Release the provider
+    return;  // g_device_count stays 0 and no daemon is created
+  }
+
+  g_device_count = g_nvml_provider->get_device_count();
+  std::cout << "SUCCESS: NVML provider initialized with " << g_device_count
+            << " device(s).\n";
+
+  // List the name the provider reports for each device index.
+  for (size_t i = 0; i < g_device_count; ++i) {
+    std::cout << "  Device " << i << ": " << g_nvml_provider->get_device_name(i)
+              << std::endl;
+  }
+
+  // Create the daemon with the tick callback; g_start_time is taken just before start().
+  g_power_daemon =
+      std::make_unique<Daemon>(power_monitoring_tick, SAMPLING_INTERVAL_MS);
+  g_start_time = std::chrono::high_resolution_clock::now();
+  g_power_daemon->start();
+  std::cout << "SUCCESS: Direct power monitoring daemon started.\n";
+}
+
+// Stops sampling, prints the per-device power summary and exports all CSVs.
+void kokkosp_finalize_library() {
+  std::cout << "\nKokkos Direct Power Profiler: Finalizing...\n";
+  if (g_power_daemon) {
+    g_power_daemon->stop();
+    std::cout << "SUCCESS: Direct power monitoring daemon stopped.\n";
+  }
+
+  // Finalize the timer
+  g_timer.finalize_library();
+
+  // Make a copy of the samples to avoid holding the lock during analysis
+  std::vector<DirectPowerSample> samples_copy;
+  {
+    std::lock_guard<std::mutex> lock(g_samples_mutex);
+    samples_copy = g_power_samples;
+  }
+
+  // Build the file prefix once so every CSV written below shares it.
+  const std::string prefix = generate_prefix();
+  if (samples_copy.empty()) {
+    std::cout << "No direct power samples collected.\n";
+  } else {
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto total_duration_s =
+        std::chrono::duration<double>(end_time - g_start_time).count();
+
+    std::cout << "\n==== Direct Power Profile Summary ====\n";
+    std::cout << std::fixed << std::setprecision(2);
+    std::cout << "Total Monitoring Duration: " << total_duration_s << " s\n";
+    std::cout << "Samples Collected:         " << samples_copy.size() << "\n";
+    std::cout << "Number of Devices:         " << g_device_count << "\n";
+    std::cout << "---------------------------------------\n";
+
+    // Analyze each device separately
+    for (size_t dev = 0; dev < g_device_count; ++dev) {
+      double avg_power, min_power, max_power, total_energy;
+      analyze_device_power_data(samples_copy, dev, avg_power, min_power,
+                                max_power, total_energy);
+
+      std::cout << "Device " << dev << " ("
+                << g_nvml_provider->get_device_name(dev) << "):\n";
+      std::cout << "  Average Power:           " << avg_power << " W\n";
+      std::cout << "  Minimum Power:           " << min_power << " W\n";
+      std::cout << "  Maximum Power:           " << max_power << " W\n";
+      std::cout << "  Total Energy Consumed:   " << total_energy << " J\n";
+      std::cout << "---------------------------------------\n";
+    }
+
+    std::string csv_filename = prefix + "_nvml_direct_power_samples.csv";
+    std::cout << "Exporting direct power data to " << csv_filename << "...\n";
+    export_direct_power_data_csv(csv_filename);
+  }
+
+  // Kernel, region and deep-copy timing summaries plus their CSV exports.
+  const auto& kernels = g_timer.get_kernel_timings();
+  KokkosTools::Timer::print_kernels_summary(kernels);
+  KokkosTools::Timer::export_kernels_csv(kernels, prefix + "_kernels.csv");
+
+  const auto& regions = g_timer.get_region_timings();
+  KokkosTools::Timer::print_regions_summary(regions);
+  KokkosTools::Timer::export_regions_csv(regions, prefix + "_regions.csv");
+
+  const auto& deepcopies = g_timer.get_deep_copy_timings();
+  KokkosTools::Timer::print_deepcopies_summary(deepcopies);
+  KokkosTools::Timer::export_deepcopies_csv(deepcopies,
+                                            prefix + "_deepcopies.csv");
+
+  if (g_nvml_provider) {
+    g_nvml_provider->finalize();
+    std::cout << "SUCCESS: NVML provider finalized.\n";
+  }
+}
+
+// --- Hook Implementations: each one only forwards to the shared g_timer ---
+void kokkosp_begin_parallel_for(const char* name, uint32_t devID,
+                                uint64_t* kID) {
+  g_timer.begin_parallel_for(name, devID, *kID);  // NOTE(review): passes *kID while scan/reduce pass kID - confirm against KernelTimerTool
+}
+void kokkosp_end_parallel_for(uint64_t kID) { g_timer.end_parallel_for(kID); }
+void kokkosp_begin_parallel_scan(const char* name, uint32_t devID,
+                                 uint64_t* kID) {
+  g_timer.begin_parallel_scan(name, devID, kID);
+}
+void kokkosp_end_parallel_scan(uint64_t kID) { g_timer.end_parallel_scan(kID); }
+void kokkosp_begin_parallel_reduce(const char* name, uint32_t devID,
+                                   uint64_t* kID) {
+  g_timer.begin_parallel_reduce(name, devID, kID);
+}
+void kokkosp_end_parallel_reduce(uint64_t kID) {
+  g_timer.end_parallel_reduce(kID);
+}
+void kokkosp_push_profile_region(const char* regionName) {
+  g_timer.push_profile_region(regionName);
+}
+void kokkosp_pop_profile_region() { g_timer.pop_profile_region(); }
+void kokkosp_begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                             const char* dst_name, const void* dst_ptr,
+                             Kokkos::Tools::SpaceHandle src_handle,
+                             const char* src_name, const void* src_ptr,
+                             uint64_t size) {
+  g_timer.begin_deep_copy(dst_handle, dst_name, dst_ptr, src_handle, src_name,
+                          src_ptr, size);  // all arguments are passed through unchanged
+}
+void kokkosp_end_deep_copy() { g_timer.end_deep_copy(); }
+
+}  // namespace DirectPower
+}  // namespace KokkosTools
+
+extern "C" {  // hook exports; EXPOSE_* macros are not defined in this file (presumably kp_core.hpp - confirm)
+
+namespace impl = KokkosTools::DirectPower;  // alias keeps the macro arguments short
+
+EXPOSE_INIT(impl::kokkosp_init_library)
+EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
+EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
+EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for)
+EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
+EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
+EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
+EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
+EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region)
+EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
+EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy)
+EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy)
+
+}  // extern "C"
diff --git a/profiling/energy-profiler/kokkos/kp_nvml_energy_consumption.cpp b/profiling/energy-profiler/kokkos/kp_nvml_energy_consumption.cpp
new file mode 100644
index 000000000..94f8bcf0e
--- /dev/null
+++ b/profiling/energy-profiler/kokkos/kp_nvml_energy_consumption.cpp
@@ -0,0 +1,533 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos Energy Consumption Profiler
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/**
+ * @file kp_nvml_energy_consumption.cpp
+ * @brief Kokkos Energy Consumption Profiler Tool using NVML.
+ *
+ * This tool measures energy consumption by tracking the cumulative energy
+ * values from NVML at the beginning and end of kernels, regions, and deep
+ * copies. It does not use a background daemon since the energy consumption is a
+ * cumulative counter that can be read directly when events occur.
+ */
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <mutex>
+#include <iomanip>
+#include <cmath>
+#include <fstream>
+#include <unordered_map>
+#include <stack>
+#include <memory>
+
+#include "kp_core.hpp"
+#include "../provider/provider_nvml.hpp"
+#include "../common/filename_prefix.hpp"
+#include "../common/timer.hpp"
+#include "../tools/kernel_timer_tool.hpp"
+
+namespace KokkosTools {
+namespace EnergyConsumption {
+
+// --- Global State for the Profiler ---
+static std::unique_ptr<NVMLProvider> g_nvml_provider;  // reset to null when initialize() fails
+
+// Timer tool that every profiling hook in this file forwards to before energy bookkeeping.
+static KernelTimerTool g_timer;
+
+static size_t g_device_count = 0;  // from get_device_count(); stays 0 if provider init fails
+static std::chrono::high_resolution_clock::time_point g_start_time;  // set at the end of a successful init
+
+// Energy tracking structures
+struct EnergySnapshot {  // per-device energy readings taken at one instant
+  std::chrono::high_resolution_clock::time_point timestamp;  // when the snapshot was captured
+  std::vector<double>
+      device_energies_joules;  // Energy for each device in Joules; negative = invalid reading
+};
+
+struct KernelEnergyRecord {  // one parallel_for / parallel_scan / parallel_reduce launch
+  std::string name;
+  uint32_t devID = 0;
+  uint64_t kID   = 0;  // key under which the record is held in g_active_kernels
+  EnergySnapshot start_energy;
+  EnergySnapshot end_energy;
+  double duration_seconds = 0.0;  // zero until the matching end hook fills it in
+  std::vector<double> energy_consumed_joules;  // Per device
+};
+
+struct RegionEnergyRecord {  // one pushed-then-popped profile region
+  std::string name;
+  EnergySnapshot start_energy;
+  EnergySnapshot end_energy;
+  double duration_seconds = 0.0;  // defined value even for a default-constructed record
+  std::vector<double> energy_consumed_joules;  // Per device
+};
+
+struct DeepCopyEnergyRecord {  // one deep copy between two named allocations
+  std::string dst_name;
+  std::string src_name;
+  uint64_t size = 0;  // size value handed to the begin_deep_copy hook
+  EnergySnapshot start_energy;
+  EnergySnapshot end_energy;
+  double duration_seconds = 0.0;  // zero until the end hook fills it in
+  std::vector<double> energy_consumed_joules;  // Per device
+};
+
+// Completed records; the hooks append to these while holding g_energy_mutex.
+static std::vector<KernelEnergyRecord> g_kernel_energy_records;
+static std::vector<RegionEnergyRecord> g_region_energy_records;
+static std::vector<DeepCopyEnergyRecord> g_deep_copy_energy_records;
+static std::mutex g_energy_mutex;
+
+// Open regions (name + start snapshot); pushed/popped by the region hooks without g_energy_mutex.
+static std::stack<std::pair<std::string, EnergySnapshot>> g_region_stack;
+
+// In-flight kernels keyed by kID, and at most one in-flight deep copy (.first = active flag).
+static std::unordered_map<uint64_t, KernelEnergyRecord> g_active_kernels;
+static std::pair<bool, DeepCopyEnergyRecord> g_active_deep_copy = {false, {}};
+
+/**
+ * @brief Takes a timestamped energy reading for each of the g_device_count devices.
+ */
+EnergySnapshot capture_energy_snapshot() {
+  EnergySnapshot snap;
+  snap.timestamp = std::chrono::high_resolution_clock::now();
+
+  auto& readings = snap.device_energies_joules;
+  readings.reserve(g_device_count);
+  const bool provider_ready =
+      g_nvml_provider && g_nvml_provider->is_initialized();
+
+  // Without a usable provider every device slot gets the -1.0 sentinel;
+  // otherwise each slot holds the provider's reading for that device.
+  for (size_t dev = 0; dev < g_device_count; ++dev) {
+    if (provider_ready) {
+      readings.push_back(g_nvml_provider->get_current_energy_consumption(dev));
+    } else {
+      readings.push_back(-1.0);
+    }
+  }
+
+  return snap;
+}
+
+/**
+ * @brief Per-device energy difference (end - start); -1.0 marks an unusable pair.
+ */
+std::vector<double> calculate_energy_delta(const EnergySnapshot& start,
+                                           const EnergySnapshot& end) {
+  const auto& before = start.device_energies_joules;
+  const auto& after  = end.device_energies_joules;
+
+  std::vector<double> consumed(g_device_count, 0.0);
+  for (size_t dev = 0; dev < g_device_count; ++dev) {
+    const bool usable = dev < before.size() && dev < after.size() &&
+                        before[dev] >= 0 && after[dev] >= 0;
+    if (!usable) {
+      consumed[dev] = -1.0;  // Invalid measurement
+      continue;
+    }
+
+    // A negative difference means the counter went backwards (e.g. a
+    // reset); report 0 for that interval rather than a negative energy.
+    const double diff = after[dev] - before[dev];
+    consumed[dev]     = diff < 0 ? 0 : diff;
+  }
+
+  return consumed;
+}
+
+/** @brief Seconds elapsed from the timestamp of `start` to that of `end`. */
+double calculate_duration_seconds(const EnergySnapshot& start,
+                                  const EnergySnapshot& end) {
+  const std::chrono::duration<double> elapsed =
+      end.timestamp - start.timestamp;
+  return elapsed.count();
+}
+
+// Writes every kernel, region and deep-copy energy record to `filename` as CSV.
+void export_energy_consumption_csv(const std::string& filename) {
+  std::ofstream file(filename);
+  if (!file.is_open()) {
+    std::cerr << "ERROR: Unable to open file " << filename << " for writing.\n";
+    return;
+  }
+
+  // Labels can be C++ type names ("Foo<int, 3>"): quote any field holding a
+  // comma, quote or line break (embedded quotes doubled) so columns stay aligned.
+  const auto csv_field = [](const std::string& text) {
+    if (text.find_first_of(",\"\r\n") == std::string::npos) return text;
+    std::string quoted = "\"";
+    for (const char c : text) {
+      if (c == '"') quoted += '"';
+      quoted += c;
+    }
+    return quoted + '"';
+  };
+
+  // One row: type, name, duration, then per-device energy (-1 if missing).
+  const auto write_row = [&](const char* type, const std::string& name,
+                             double duration_seconds,
+                             const std::vector<double>& energies) {
+    file << type << ',' << csv_field(name) << ',' << duration_seconds;
+    for (size_t i = 0; i < g_device_count; ++i) {
+      file << ',' << (i < energies.size() ? energies[i] : -1.0);
+    }
+    file << '\n';
+  };
+
+  file << "type,name,duration_seconds";
+  for (size_t i = 0; i < g_device_count; ++i) {
+    file << ",device_" << i << "_energy_joules";
+  }
+  file << "\n";
+
+  for (const auto& record : g_kernel_energy_records) {
+    write_row("kernel", record.name, record.duration_seconds,
+              record.energy_consumed_joules);
+  }
+  for (const auto& record : g_region_energy_records) {
+    write_row("region", record.name, record.duration_seconds,
+              record.energy_consumed_joules);
+  }
+  for (const auto& record : g_deep_copy_energy_records) {
+    const std::string name = record.src_name + "_to_" + record.dst_name +
+                             "_size_" + std::to_string(record.size);
+    write_row("deepcopy", name, record.duration_seconds,
+              record.energy_consumed_joules);
+  }
+
+  file.close();
+  std::cout << "Energy consumption data exported to " << filename << std::endl;
+}
+
+void print_energy_summary() {  // prints record counts and per-device energy totals to std::cout
+  std::cout << "\n==== Energy Consumption Profile Summary ====\n";
+  std::cout << std::fixed << std::setprecision(4);  // stays set on std::cout after this function returns
+
+  // Per-device totals for each record kind; negative (invalid) entries are left out.
+  std::vector<double> total_kernel_energy(g_device_count, 0.0);
+  std::vector<double> total_region_energy(g_device_count, 0.0);
+  std::vector<double> total_deepcopy_energy(g_device_count, 0.0);
+
+  for (const auto& record : g_kernel_energy_records) {
+    for (size_t i = 0;
+         i < g_device_count && i < record.energy_consumed_joules.size(); ++i) {
+      if (record.energy_consumed_joules[i] >= 0) {
+        total_kernel_energy[i] += record.energy_consumed_joules[i];
+      }
+    }
+  }
+
+  for (const auto& record : g_region_energy_records) {
+    for (size_t i = 0;
+         i < g_device_count && i < record.energy_consumed_joules.size(); ++i) {
+      if (record.energy_consumed_joules[i] >= 0) {
+        total_region_energy[i] += record.energy_consumed_joules[i];
+      }
+    }
+  }
+
+  for (const auto& record : g_deep_copy_energy_records) {
+    for (size_t i = 0;
+         i < g_device_count && i < record.energy_consumed_joules.size(); ++i) {
+      if (record.energy_consumed_joules[i] >= 0) {
+        total_deepcopy_energy[i] += record.energy_consumed_joules[i];
+      }
+    }
+  }
+
+  std::cout << "Number of Kernels:         " << g_kernel_energy_records.size()
+            << "\n";
+  std::cout << "Number of Regions:         " << g_region_energy_records.size()
+            << "\n";
+  std::cout << "Number of Deep Copies:     "
+            << g_deep_copy_energy_records.size() << "\n";
+  std::cout << "Number of Devices:         " << g_device_count << "\n";
+  std::cout << "--------------------------------------------\n";
+
+  for (size_t dev = 0; dev < g_device_count; ++dev) {  // body dereferences g_nvml_provider
+    std::cout << "Device " << dev << " ("
+              << g_nvml_provider->get_device_name(dev) << "):\n";
+    std::cout << "  Total Kernel Energy:       " << total_kernel_energy[dev]
+              << " J\n";
+    std::cout << "  Total Region Energy:       " << total_region_energy[dev]
+              << " J\n";
+    std::cout << "  Total Deep Copy Energy:    " << total_deepcopy_energy[dev]
+              << " J\n";
+    std::cout << "  Total Energy:              "  // NOTE(review): regions presumably enclose kernels/copies, so this sum may double count - confirm intent
+              << (total_kernel_energy[dev] + total_region_energy[dev] +
+                  total_deepcopy_energy[dev])
+              << " J\n";
+    std::cout << "--------------------------------------------\n";
+  }
+}
+
+// --- Kokkos Profiling Hooks ---
+
+void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
+                          const uint32_t devInfoCount,
+                          Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
+  std::cout << "Kokkos Energy Consumption Profiler: Initializing...\n";
+
+  // The timer tool is initialized first, so timing is set up even if NVML fails below.
+  g_timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
+
+  g_nvml_provider = std::make_unique<NVMLProvider>();
+  if (!g_nvml_provider->initialize()) {
+    std::cerr << "ERROR: Failed to initialize NVML provider. Energy "
+                 "consumption profiling disabled.\n";
+    g_nvml_provider.reset();  // Release the provider
+    return;  // g_device_count stays 0 and g_start_time is never set
+  }
+
+  g_device_count = g_nvml_provider->get_device_count();
+  std::cout << "SUCCESS: NVML provider initialized with " << g_device_count
+            << " device(s).\n";
+
+  // List the name the provider reports for each device index.
+  for (size_t i = 0; i < g_device_count; ++i) {
+    std::cout << "  Device " << i << ": " << g_nvml_provider->get_device_name(i)
+              << std::endl;
+  }
+
+  g_start_time = std::chrono::high_resolution_clock::now();  // reference point for the total duration
+  std::cout << "SUCCESS: Energy consumption monitoring initialized.\n";
+}
+
+void kokkosp_finalize_library() {  // prints the summaries, writes the CSV files, finalizes NVML
+  std::cout << "\nKokkos Energy Consumption Profiler: Finalizing...\n";
+
+  // Finalize the timer
+  g_timer.finalize_library();
+
+  auto end_time = std::chrono::high_resolution_clock::now();
+  auto total_duration_s =
+      std::chrono::duration<double>(end_time - g_start_time).count();  // g_start_time is only set when init succeeded
+
+  std::cout << "Total Monitoring Duration: " << total_duration_s << " s\n";
+
+  print_energy_summary();
+
+  std::string prefix = generate_prefix();  // shared by every file written below
+
+  // Export energy data
+  std::string csv_filename = prefix + "_nvml_energy_consumption.csv";
+  std::cout << "Exporting energy consumption data to " << csv_filename
+            << "...\n";
+  export_energy_consumption_csv(csv_filename);
+
+  // Timing summaries from g_timer and their CSV exports
+  const auto& kernels = g_timer.get_kernel_timings();
+  KokkosTools::Timer::print_kernels_summary(kernels);
+  KokkosTools::Timer::export_kernels_csv(kernels, prefix + "_kernels.csv");
+
+  const auto& regions = g_timer.get_region_timings();
+  KokkosTools::Timer::print_regions_summary(regions);
+  KokkosTools::Timer::export_regions_csv(regions, prefix + "_regions.csv");
+
+  const auto& deepcopies = g_timer.get_deep_copy_timings();
+  KokkosTools::Timer::print_deepcopies_summary(deepcopies);
+  KokkosTools::Timer::export_deepcopies_csv(deepcopies,
+                                            prefix + "_deepcopies.csv");
+
+  if (g_nvml_provider) {  // null when provider initialization failed
+    g_nvml_provider->finalize();
+    std::cout << "SUCCESS: NVML provider finalized.\n";
+  }
+}
+
+// --- Hook Implementations with Timer and Energy Integration ---
+void kokkosp_begin_parallel_for(const char* name, uint32_t devID,
+                                uint64_t* kID) {
+  g_timer.begin_parallel_for(name, devID, *kID);  // NOTE(review): passes *kID while scan/reduce pass kID - confirm against KernelTimerTool
+
+  // Open a record with the starting snapshot; kokkosp_end_parallel_for completes it.
+  KernelEnergyRecord record;
+  record.name         = name;
+  record.devID        = devID;
+  record.kID          = *kID;  // read after the timer call above
+  record.start_energy = capture_energy_snapshot();
+
+  std::lock_guard<std::mutex> lock(g_energy_mutex);
+  g_active_kernels[*kID] = record;  // replaces any record already stored under this id
+}
+
+void kokkosp_end_parallel_for(uint64_t kID) {
+  g_timer.end_parallel_for(kID);
+
+  std::lock_guard<std::mutex> guard(g_energy_mutex);
+  const auto entry = g_active_kernels.find(kID);
+  if (entry == g_active_kernels.end()) return;  // no open record for this id
+  // Complete the record in place, archive a copy, then drop the active entry.
+  KernelEnergyRecord& rec = entry->second;
+  rec.end_energy          = capture_energy_snapshot();
+  rec.duration_seconds =
+      calculate_duration_seconds(rec.start_energy, rec.end_energy);
+  rec.energy_consumed_joules =
+      calculate_energy_delta(rec.start_energy, rec.end_energy);
+  g_kernel_energy_records.push_back(rec);
+  g_active_kernels.erase(entry);
+}
+
+void kokkosp_begin_parallel_scan(const char* name, uint32_t devID,
+                                 uint64_t* kID) {
+  g_timer.begin_parallel_scan(name, devID, kID);  // the timer receives the kID pointer itself
+
+  KernelEnergyRecord record;  // open record; kokkosp_end_parallel_scan completes it
+  record.name         = name;
+  record.devID        = devID;
+  record.kID          = *kID;  // read after the timer call above
+  record.start_energy = capture_energy_snapshot();
+
+  std::lock_guard<std::mutex> lock(g_energy_mutex);
+  g_active_kernels[*kID] = record;  // replaces any record already stored under this id
+}
+
+void kokkosp_end_parallel_scan(uint64_t kID) {
+  g_timer.end_parallel_scan(kID);
+
+  std::lock_guard<std::mutex> guard(g_energy_mutex);
+  const auto entry = g_active_kernels.find(kID);
+  if (entry == g_active_kernels.end()) return;  // no open record for this id
+  // Complete the record in place, archive a copy, then drop the active entry.
+  KernelEnergyRecord& rec = entry->second;
+  rec.end_energy          = capture_energy_snapshot();
+  rec.duration_seconds =
+      calculate_duration_seconds(rec.start_energy, rec.end_energy);
+  rec.energy_consumed_joules =
+      calculate_energy_delta(rec.start_energy, rec.end_energy);
+  g_kernel_energy_records.push_back(rec);
+  g_active_kernels.erase(entry);
+}
+
+void kokkosp_begin_parallel_reduce(const char* name, uint32_t devID,
+                                   uint64_t* kID) {
+  g_timer.begin_parallel_reduce(name, devID, kID);  // the timer receives the kID pointer itself
+
+  KernelEnergyRecord record;  // open record; kokkosp_end_parallel_reduce completes it
+  record.name         = name;
+  record.devID        = devID;
+  record.kID          = *kID;  // read after the timer call above
+  record.start_energy = capture_energy_snapshot();
+
+  std::lock_guard<std::mutex> lock(g_energy_mutex);
+  g_active_kernels[*kID] = record;  // replaces any record already stored under this id
+}
+
+void kokkosp_end_parallel_reduce(uint64_t kID) {
+  g_timer.end_parallel_reduce(kID);
+
+  std::lock_guard<std::mutex> guard(g_energy_mutex);
+  const auto entry = g_active_kernels.find(kID);
+  if (entry == g_active_kernels.end()) return;  // no open record for this id
+  // Complete the record in place, archive a copy, then drop the active entry.
+  KernelEnergyRecord& rec = entry->second;
+  rec.end_energy          = capture_energy_snapshot();
+  rec.duration_seconds =
+      calculate_duration_seconds(rec.start_energy, rec.end_energy);
+  rec.energy_consumed_joules =
+      calculate_energy_delta(rec.start_energy, rec.end_energy);
+  g_kernel_energy_records.push_back(rec);
+  g_active_kernels.erase(entry);
+}
+
+void kokkosp_push_profile_region(const char* regionName) {
+  g_timer.push_profile_region(regionName);
+
+  EnergySnapshot snapshot = capture_energy_snapshot();  // starting energy for this region
+  g_region_stack.push({std::string(regionName), snapshot});  // stack is modified without g_energy_mutex
+}
+
+void kokkosp_pop_profile_region() {
+  g_timer.pop_profile_region();
+
+  if (!g_region_stack.empty()) {  // a pop without a recorded push produces no energy record
+    auto [name, start_energy] = g_region_stack.top();  // copies the innermost open region
+    g_region_stack.pop();
+
+    RegionEnergyRecord record;
+    record.name         = name;
+    record.start_energy = start_energy;
+    record.end_energy   = capture_energy_snapshot();
+    record.duration_seconds =
+        calculate_duration_seconds(record.start_energy, record.end_energy);
+    record.energy_consumed_joules =
+        calculate_energy_delta(record.start_energy, record.end_energy);
+
+    std::lock_guard<std::mutex> lock(g_energy_mutex);  // guards only the append below
+    g_region_energy_records.push_back(record);
+  }
+}
+
+void kokkosp_begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                             const char* dst_name, const void* dst_ptr,
+                             Kokkos::Tools::SpaceHandle src_handle,
+                             const char* src_name, const void* src_ptr,
+                             uint64_t size) {
+  g_timer.begin_deep_copy(dst_handle, dst_name, dst_ptr, src_handle, src_name,
+                          src_ptr, size);
+
+  std::lock_guard<std::mutex> lock(g_energy_mutex);
+  if (!g_active_deep_copy.first) {  // a begin while another copy is still open is not recorded
+    g_active_deep_copy.second.dst_name     = dst_name ? dst_name : "unknown";
+    g_active_deep_copy.second.src_name     = src_name ? src_name : "unknown";
+    g_active_deep_copy.second.size         = size;
+    g_active_deep_copy.second.start_energy = capture_energy_snapshot();
+    g_active_deep_copy.first               = true;  // mark the single slot as in use
+  }
+}
+
+void kokkosp_end_deep_copy() {
+  g_timer.end_deep_copy();
+
+  std::lock_guard<std::mutex> lock(g_energy_mutex);
+  if (g_active_deep_copy.first) {  // only when a begin was recorded
+    g_active_deep_copy.second.end_energy = capture_energy_snapshot();
+    g_active_deep_copy.second.duration_seconds =
+        calculate_duration_seconds(g_active_deep_copy.second.start_energy,
+                                   g_active_deep_copy.second.end_energy);
+    g_active_deep_copy.second.energy_consumed_joules =
+        calculate_energy_delta(g_active_deep_copy.second.start_energy,
+                               g_active_deep_copy.second.end_energy);
+
+    g_deep_copy_energy_records.push_back(g_active_deep_copy.second);  // archive a copy of the finished record
+    g_active_deep_copy.first = false;  // free the slot for the next deep copy
+  }
+}
+
+}  // namespace EnergyConsumption
+}  // namespace KokkosTools
+
+extern "C" {  // hook exports; EXPOSE_* macros are not defined in this file (presumably kp_core.hpp - confirm)
+
+namespace impl = KokkosTools::EnergyConsumption;  // alias keeps the macro arguments short
+
+EXPOSE_INIT(impl::kokkosp_init_library)
+EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
+EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
+EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for)
+EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
+EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
+EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
+EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
+EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region)
+EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
+EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy)
+EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy)
+
+}  // extern "C"
diff --git a/profiling/energy-profiler/kokkos/kp_nvml_power.cpp b/profiling/energy-profiler/kokkos/kp_nvml_power.cpp
new file mode 100644
index 000000000..cedbfa157
--- /dev/null
+++ b/profiling/energy-profiler/kokkos/kp_nvml_power.cpp
@@ -0,0 +1,288 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos Power Profiler
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/**
+ * @file kp_nvml_power.cpp
+ * @brief Kokkos Power Profiler Tool using NVML.
+ *
+ * This tool leverages a background daemon to periodically sample GPU power
+ * consumption using the NVML library. It starts monitoring when the Kokkos
+ * library is initialized and prints a detailed power profile upon finalization.
+ */
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <mutex>
+#include <iomanip>
+#include <cmath>
+#include <fstream>
+
+#include "kp_core.hpp"
+#include "../common/daemon.hpp"
+#include "../provider/provider_nvml.hpp"
+#include "../common/filename_prefix.hpp"
+#include "../common/timer.hpp"
+#include "../tools/kernel_timer_tool.hpp"
+
+namespace KokkosTools {
+namespace Power {
+
+// --- Configuration ---
+// The interval in milliseconds for power sampling.
+// It is passed to the Daemon constructor and echoed at start-up by
+// kokkosp_init_library().
+constexpr int SAMPLING_INTERVAL_MS = 20;
+
+// --- Global State for the Profiler ---
+// Both pointers are created in kokkosp_init_library(); the provider is reset
+// again there when NVML initialization fails, in which case no daemon is made.
+static std::unique_ptr<Daemon> g_power_daemon;
+static std::unique_ptr<NVMLProvider> g_nvml_provider;
+
+// Timer tool for kernel and region timing
+// (every profiling hook in this file forwards to it).
+static KernelTimerTool g_timer;
+
+// Structure to store a single power measurement with a timestamp.
+// Field order matters: power_monitoring_tick() brace-initializes a sample as
+// {timestamp, power}.
+struct PowerSample {
+  // high_resolution_clock reading taken when the sample was stored.
+  std::chrono::high_resolution_clock::time_point timestamp;
+  // Value returned by the provider's get_total_power_usage(), in watts.
+  double power_watts;
+};
+
+// Thread-safe storage for collected power samples.
+// The daemon callback appends to g_power_samples and
+// kokkosp_finalize_library() copies it, both while holding g_samples_mutex.
+static std::vector<PowerSample> g_power_samples;
+static std::mutex g_samples_mutex;
+// Set in kokkosp_init_library() right before the daemon is started; used at
+// finalization to report the total monitoring duration.
+static std::chrono::high_resolution_clock::time_point g_start_time;
+
+/**
+ * @brief Daemon callback that records one power sample.
+ *
+ * Does nothing unless the NVML provider exists and reports itself as
+ * initialized. Otherwise it reads the provider's total power usage and
+ * appends it, together with the current high-resolution timestamp, to
+ * g_power_samples while holding g_samples_mutex.
+ */
+void power_monitoring_tick() {
+  const bool provider_ready =
+      g_nvml_provider && g_nvml_provider->is_initialized();
+  if (!provider_ready) {
+    return;
+  }
+
+  // Query the provider before locking so the lock only covers the append.
+  const double watts = g_nvml_provider->get_total_power_usage();
+
+  std::lock_guard<std::mutex> guard(g_samples_mutex);
+  PowerSample sample{std::chrono::high_resolution_clock::now(), watts};
+  g_power_samples.push_back(sample);
+}
+
+/**
+ * @brief Derives summary statistics from a sequence of power samples.
+ *
+ * Energy is integrated with a left-hand rule: the power of each sample is
+ * multiplied by the time elapsed until the next sample. With fewer than two
+ * samples the energy is therefore 0. An empty input sets every output to 0.
+ *
+ * @param samples Power samples in the order they were collected.
+ * @param[out] avg_power Arithmetic mean of all sample powers (W).
+ * @param[out] min_power Smallest sample power (W).
+ * @param[out] max_power Largest sample power (W).
+ * @param[out] total_energy Integrated energy in Joules.
+ */
+void analyze_power_data(const std::vector<PowerSample>& samples,
+                        double& avg_power, double& min_power, double& max_power,
+                        double& total_energy) {
+  if (samples.empty()) {
+    avg_power = min_power = max_power = total_energy = 0.0;
+    return;
+  }
+
+  double lowest      = samples.front().power_watts;
+  double highest     = lowest;
+  double running_sum = 0.0;
+  double joules      = 0.0;
+
+  // `previous` stays null for the first sample, which has no interval yet.
+  const PowerSample* previous = nullptr;
+  for (const PowerSample& current : samples) {
+    running_sum += current.power_watts;
+    if (current.power_watts < lowest) lowest = current.power_watts;
+    if (current.power_watts > highest) highest = current.power_watts;
+
+    if (previous != nullptr) {
+      // Energy = Power * Time, using the power at the start of the interval.
+      const std::chrono::duration<double> gap(current.timestamp -
+                                              previous->timestamp);
+      joules += previous->power_watts * gap.count();
+    }
+    previous = &current;
+  }
+
+  min_power    = lowest;
+  max_power    = highest;
+  total_energy = joules;
+  avg_power    = running_sum / samples.size();
+}
+
+/**
+ * @brief Writes all collected power samples to a CSV file.
+ *
+ * The file has the header "timestamp,power_watts"; each timestamp is the
+ * sample's high_resolution_clock reading in milliseconds since that clock's
+ * epoch. Errors (file cannot be opened, write failure) are reported on
+ * stderr and no success message is printed.
+ *
+ * @param filename Path of the CSV file to create or overwrite.
+ */
+void export_power_data_csv(const std::string& filename) {
+  std::ofstream file(filename);
+  if (!file.is_open()) {
+    std::cerr << "ERROR: Unable to open file " << filename << " for writing.\n";
+    return;
+  }
+  file << "timestamp,power_watts\n";
+  {
+    // g_power_samples is shared with the sampling callback, so read it under
+    // the same mutex every other access uses.
+    std::lock_guard<std::mutex> lock(g_samples_mutex);
+    for (const auto& sample : g_power_samples) {
+      auto timestamp = std::chrono::duration_cast<std::chrono::milliseconds>(
+                           sample.timestamp.time_since_epoch())
+                           .count();
+      file << timestamp << "," << sample.power_watts << "\n";
+    }
+  }
+  file.close();
+  // close() flushes; a failed stream here means the data did not reach disk.
+  if (file.fail()) {
+    std::cerr << "ERROR: Failed while writing power data to " << filename
+              << ".\n";
+    return;
+  }
+  std::cout << "Power data exported to " << filename << std::endl;
+}
+
+// --- Kokkos Profiling Hooks ---
+
+/**
+ * @brief Kokkos hook: sets up the timer tool, NVML, and the sampling daemon.
+ *
+ * All arguments are forwarded unchanged to the timer tool. If the NVML
+ * provider cannot be initialized, the provider is released and no daemon is
+ * created.
+ */
+void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
+                          const uint32_t devInfoCount,
+                          Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
+  std::cout << "Kokkos Power Profiler: Initializing...\n"
+            << "Sampling Interval: " << SAMPLING_INTERVAL_MS << " ms\n";
+
+  // The timer is brought up before NVML, so it is initialized even when the
+  // early return below is taken.
+  g_timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
+
+  g_nvml_provider        = std::make_unique<NVMLProvider>();
+  const bool nvml_usable = g_nvml_provider->initialize();
+  if (!nvml_usable) {
+    std::cerr << "ERROR: Failed to initialize NVML provider. Power profiling "
+                 "disabled.\n";
+    g_nvml_provider.reset();  // Release the provider
+    return;
+  }
+
+  const auto device_total = g_nvml_provider->get_device_count();
+  std::cout << "SUCCESS: NVML provider initialized with " << device_total
+            << " device(s).\n";
+
+  // Create the sampler, stamp the start time, then let it run.
+  g_power_daemon =
+      std::make_unique<Daemon>(power_monitoring_tick, SAMPLING_INTERVAL_MS);
+  g_start_time = std::chrono::high_resolution_clock::now();
+  g_power_daemon->start();
+  std::cout << "SUCCESS: Power monitoring daemon started.\n";
+}
+
+/**
+ * @brief Kokkos hook: stops sampling, reports power statistics, exports data.
+ *
+ * Steps, in order: stop the daemon (if one was created), finalize the timer,
+ * print the power summary and export the samples to CSV (only when samples
+ * exist), print and export kernel/region/deep-copy timings, and finalize the
+ * NVML provider (if present). std::cout's formatting state is restored
+ * before returning so the host application's output is not affected.
+ */
+void kokkosp_finalize_library() {
+  // This tool runs inside the profiled application's process: remember
+  // std::cout's formatting so std::fixed/setprecision below do not leak out.
+  const std::ios_base::fmtflags saved_flags = std::cout.flags();
+  const std::streamsize saved_precision     = std::cout.precision();
+
+  std::cout << "\nKokkos Power Profiler: Finalizing...\n";
+
+  if (g_power_daemon) {
+    g_power_daemon->stop();
+    std::cout << "SUCCESS: Power monitoring daemon stopped.\n";
+  }
+
+  // Finalize the timer
+  g_timer.finalize_library();
+
+  // Make a copy of the samples to avoid holding the lock during analysis
+  std::vector<PowerSample> samples_copy;
+  {
+    std::lock_guard<std::mutex> lock(g_samples_mutex);
+    samples_copy = g_power_samples;
+  }
+
+  if (samples_copy.empty()) {
+    std::cout << "No power samples collected.\n";
+  } else {
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto total_duration_s =
+        std::chrono::duration<double>(end_time - g_start_time).count();
+
+    double avg_power, min_power, max_power, total_energy;
+    analyze_power_data(samples_copy, avg_power, min_power, max_power,
+                       total_energy);
+
+    std::cout << "\n==== Power Profile Summary ====\n";
+    std::cout << std::fixed << std::setprecision(2);
+    std::cout << "Total Monitoring Duration: " << total_duration_s << " s\n";
+    std::cout << "Samples Collected:         " << samples_copy.size() << "\n";
+    std::cout << "---------------------------------\n";
+    std::cout << "Average Power:             " << avg_power << " W\n";
+    std::cout << "Minimum Power:             " << min_power << " W\n";
+    std::cout << "Maximum Power:             " << max_power << " W\n";
+    std::cout << "Total Energy Consumed:     " << total_energy << " J\n";
+    std::cout << "===============================\n";
+
+    std::string csv_filename = generate_prefix() + "_nvml_power_samples.csv";
+    std::cout << "Exporting power data to " << csv_filename << "...\n";
+    export_power_data_csv(csv_filename);
+  }
+
+  std::string prefix = generate_prefix();
+
+  // Timing results are reported whether or not power sampling worked.
+  const auto& kernels = g_timer.get_kernel_timings();
+  KokkosTools::Timer::print_kernels_summary(kernels);
+  KokkosTools::Timer::export_kernels_csv(kernels, prefix + "_kernels.csv");
+
+  const auto& regions = g_timer.get_region_timings();
+  KokkosTools::Timer::print_regions_summary(regions);
+  KokkosTools::Timer::export_regions_csv(regions, prefix + "_regions.csv");
+
+  const auto& deepcopies = g_timer.get_deep_copy_timings();
+  KokkosTools::Timer::print_deepcopies_summary(deepcopies);
+  KokkosTools::Timer::export_deepcopies_csv(deepcopies,
+                                            prefix + "_deepcopies.csv");
+
+  if (g_nvml_provider) {
+    g_nvml_provider->finalize();
+    std::cout << "SUCCESS: NVML provider finalized.\n";
+  }
+
+  // Restore the caller-visible stream state captured at the top.
+  std::cout.flags(saved_flags);
+  std::cout.precision(saved_precision);
+}
+
+// --- Hook Implementations with Timer Integration ---
+// Kokkos hook: a parallel_for kernel is starting; forwarded to the timer tool.
+// NOTE(review): the kernel id is passed dereferenced (*kID) here, whereas the
+// scan/reduce hooks pass the pointer itself — confirm against KernelTimerTool.
+void kokkosp_begin_parallel_for(const char* name, uint32_t devID,
+                                uint64_t* kID) {
+  g_timer.begin_parallel_for(name, devID, *kID);
+}
+// Kokkos hook: the parallel_for kernel identified by kID has ended.
+void kokkosp_end_parallel_for(uint64_t kID) {
+  g_timer.end_parallel_for(kID);
+}
+// Kokkos hook: a parallel_scan kernel is starting; name, device id and the
+// kernel-id pointer are handed to the timer tool unchanged.
+void kokkosp_begin_parallel_scan(const char* name, uint32_t devID,
+                                 uint64_t* kID) {
+  g_timer.begin_parallel_scan(name, devID, kID);
+}
+// Kokkos hook: the parallel_scan kernel identified by kID has ended.
+void kokkosp_end_parallel_scan(uint64_t kID) {
+  g_timer.end_parallel_scan(kID);
+}
+// Kokkos hook: a parallel_reduce kernel is starting; name, device id and the
+// kernel-id pointer are handed to the timer tool unchanged.
+void kokkosp_begin_parallel_reduce(const char* name, uint32_t devID,
+                                   uint64_t* kID) {
+  g_timer.begin_parallel_reduce(name, devID, kID);
+}
+// Kokkos hook: the parallel_reduce kernel identified by kID has ended.
+void kokkosp_end_parallel_reduce(uint64_t kID) {
+  g_timer.end_parallel_reduce(kID);
+}
+// Kokkos hook: a named profiling region is being entered; forwarded to the
+// timer tool.
+void kokkosp_push_profile_region(const char* regionName) {
+  g_timer.push_profile_region(regionName);
+}
+// Kokkos hook: the current profiling region is being left.
+void kokkosp_pop_profile_region() {
+  g_timer.pop_profile_region();
+}
+// Kokkos hook: a deep copy of `size` bytes is starting. Destination and
+// source descriptions are passed to the timer tool unchanged.
+void kokkosp_begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                             const char* dst_name, const void* dst_ptr,
+                             Kokkos::Tools::SpaceHandle src_handle,
+                             const char* src_name, const void* src_ptr,
+                             uint64_t size) {
+  g_timer.begin_deep_copy(dst_handle, dst_name, dst_ptr, src_handle, src_name,
+                          src_ptr, size);
+}
+// Kokkos hook: the deep copy in progress has finished.
+void kokkosp_end_deep_copy() {
+  g_timer.end_deep_copy();
+}
+
+}  // namespace Power
+}  // namespace KokkosTools
+
+// C-linkage section: each EXPOSE_* macro (not defined in this file —
+// presumably provided by kp_core.hpp; confirm) is handed the Power
+// implementation of the corresponding profiling hook.
+extern "C" {
+
+// Short alias so the macro arguments below stay readable.
+namespace impl = KokkosTools::Power;
+
+EXPOSE_INIT(impl::kokkosp_init_library)
+EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
+EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
+EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for)
+EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
+EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
+EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
+EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
+EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region)
+EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
+EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy)
+EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy)
+
+}  // extern "C"
diff --git a/profiling/energy-profiler/kokkos/kp_variorum_power.cpp b/profiling/energy-profiler/kokkos/kp_variorum_power.cpp
new file mode 100644
index 000000000..8e5e47b99
--- /dev/null
+++ b/profiling/energy-profiler/kokkos/kp_variorum_power.cpp
@@ -0,0 +1,299 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/**
+ * @file kp_variorum_power.cpp
+ * @brief Kokkos Power Profiler Tool using Variorum.
+ *
+ * This tool leverages a background daemon to periodically sample GPU power
+ * consumption using the Variorum library via a provider interface. It starts
+ * monitoring when the Kokkos library is initialized and writes detailed
+ * power profiles to CSV files upon finalization.
+ */
+
+#include <iostream>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <numeric>
+#include <algorithm>
+#include <iomanip>
+#include <cmath>
+#include <memory>
+#include <cstdio>
+#include <cstdlib>
+#include <unistd.h>
+#include <inttypes.h>
+#include <fstream>
+#include <deque>
+
+#include "kp_core.hpp"
+#include "../provider/provider_variorum.hpp"
+#include "../common/daemon.hpp"
+#include "../common/filename_prefix.hpp"
+#include "../common/timer.hpp"
+#include "../tools/kernel_timer_tool.hpp"
+
+namespace KokkosTools {
+namespace VariorumPower {
+
+// Timer tool that every profiling hook in this namespace forwards to.
+// NOTE(review): unlike the other globals in this file it is neither `static`
+// nor `g_`-prefixed, so it has external linkage — confirm that is intended.
+KernelTimerTool timer;
+
+// --- Data Structures for Self-Contained Management ---
+
+// One power sample. Field order matters: the monitoring tick brace-initializes
+// it as {timestamp_ns, power}.
+struct PowerDataPoint {
+  // Value of get_current_epoch_ns() when the sample was taken.
+  int64_t timestamp_ns;
+  // Value returned by the provider's get_total_power_usage().
+  double power_watts;
+};
+
+// --- Global State for the Profiler ---
+// The daemon and provider are created in kokkosp_init_library(); the provider
+// is reset again there when Variorum initialization fails.
+static std::unique_ptr<Daemon> g_power_daemon;
+static std::unique_ptr<VariorumProvider> g_variorum_provider;
+static std::mutex g_data_mutex;  // Mutex for all data collections
+// Captured at the start of kokkosp_init_library(); used to report the total
+// execution time at finalization.
+static std::chrono::high_resolution_clock::time_point g_start_time;
+
+// Data Collections
+// Appended to by the monitoring tick and read by the CSV writer, both while
+// holding g_data_mutex.
+static std::vector<PowerDataPoint> g_power_data;
+
+// --- Helper Functions ---
+
+// Returns the current high_resolution_clock reading as whole nanoseconds
+// since that clock's epoch.
+// NOTE(review): the epoch of high_resolution_clock is implementation-defined;
+// confirm it is the wall-clock epoch on the target platforms.
+int64_t get_current_epoch_ns() {
+  using clock            = std::chrono::high_resolution_clock;
+  const auto since_epoch = clock::now().time_since_epoch();
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch)
+      .count();
+}
+
+// Writes every collected power sample to `filename` as CSV with the header
+// "timestamp_nanoseconds,power_watts"; power is printed with three decimals.
+// If the file cannot be opened, an error goes to stderr and nothing is
+// written.
+void write_power_data_to_csv(const std::string& filename) {
+  std::ofstream csv(filename);
+  if (!csv.is_open()) {
+    std::cerr << "KokkosP Variorum Power: Could not open file for writing: "
+              << filename << "\n";
+    return;
+  }
+
+  // Fixed notation with three decimals only affects the floating-point
+  // column; the integer timestamp is printed unchanged.
+  csv << "timestamp_nanoseconds,power_watts\n"
+      << std::fixed << std::setprecision(3);
+
+  std::lock_guard<std::mutex> guard(g_data_mutex);
+  for (const PowerDataPoint& sample : g_power_data) {
+    csv << sample.timestamp_ns << "," << sample.power_watts << "\n";
+  }
+  printf("KokkosP Variorum Power: Wrote power data to %s\n", filename.c_str());
+}
+
+// --- Monitoring Function (for Daemon) ---
+
+// Daemon callback: records one {timestamp, total power} sample. Does nothing
+// unless the Variorum provider exists and reports itself as initialized.
+void variorum_power_monitoring_tick() {
+  const bool provider_ready =
+      g_variorum_provider && g_variorum_provider->is_initialized();
+  if (!provider_ready) {
+    return;
+  }
+
+  // Read the power first, timestamp it second, and only then take the lock.
+  const double watts     = g_variorum_provider->get_total_power_usage();
+  const int64_t stamp_ns = get_current_epoch_ns();
+  PowerDataPoint reading{stamp_ns, watts};
+
+  std::lock_guard<std::mutex> guard(g_data_mutex);
+  g_power_data.push_back(reading);
+}
+
+// --- Kokkos Profiling Hooks ---
+
+// Kokkos hook: prints the banner, initializes the timer tool and the Variorum
+// provider, and starts the sampling daemon.
+//
+// The sampling interval defaults to 20 ms and can be overridden with the
+// KOKKOS_VARIORUM_POWER_INTERVAL environment variable (positive integer,
+// milliseconds); unparsable or non-positive values fall back to the default.
+// If Variorum cannot be initialized, power monitoring is disabled but the
+// timer tool stays active.
+void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
+                          const uint32_t devInfoCount,
+                          Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
+  printf(
+      "======================================================================"
+      "\n");
+  printf("KokkosP: Variorum Power Profiler Initialized\n");
+  printf("KokkosP: Sequence: %d, Interface Version: %llu, Devices: %u\n",
+         loadSeq, (unsigned long long)interfaceVer, devInfoCount);
+  printf(
+      "======================================================================"
+      "\n");
+
+  g_start_time = std::chrono::high_resolution_clock::now();
+
+  // Initialize the timer before anything that can bail out: the hooks and
+  // kokkosp_finalize_library() use it even when power monitoring is disabled.
+  timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
+
+  g_variorum_provider = std::make_unique<VariorumProvider>();
+  if (!g_variorum_provider->initialize()) {
+    std::cerr << "KokkosP Variorum Power: Failed to initialize Variorum, power "
+                 "monitoring disabled\n";
+    g_variorum_provider.reset();
+    return;
+  }
+
+  int interval_ms = 20;
+  if (const char* interval_env =
+          std::getenv("KOKKOS_VARIORUM_POWER_INTERVAL")) {
+    // 0 marks "invalid"; std::stoi throws on non-numeric or out-of-range text.
+    int requested_ms = 0;
+    try {
+      requested_ms = std::stoi(interval_env);
+    } catch (const std::exception&) {
+      requested_ms = 0;
+    }
+
+    if (requested_ms > 0) {
+      interval_ms = requested_ms;
+      printf("KokkosP Variorum Power: Using custom interval: %d ms\n",
+             interval_ms);
+    } else {
+      printf(
+          "KokkosP Variorum Power: Invalid interval value, using default "
+          "20ms\n");
+    }
+  } else {
+    printf("KokkosP Variorum Power: Using default interval: 20 ms\n");
+  }
+
+  g_power_daemon = std::make_unique<Daemon>(
+      std::function<void()>(variorum_power_monitoring_tick), interval_ms);
+  g_power_daemon->start();
+  printf("KokkosP Variorum Power: Power monitoring started\n");
+}
+
+// Kokkos hook: stops the sampling daemon, reports the elapsed time, writes the
+// power samples to CSV, finalizes the provider and the timer, and then prints
+// and exports the kernel, region and deep-copy timings.
+void kokkosp_finalize_library() {
+  // Take the end stamp first so the teardown work below is not counted.
+  const auto finish_time = std::chrono::high_resolution_clock::now();
+
+  printf(
+      "======================================================================"
+      "\n");
+  printf("KokkosP: Variorum Power Profiler Finalization\n");
+
+  const bool daemon_active = g_power_daemon && g_power_daemon->is_running();
+  if (daemon_active) {
+    g_power_daemon->stop();
+    printf("KokkosP Variorum Power: Power monitoring stopped\n");
+  }
+
+  // Elapsed time is truncated to whole milliseconds before conversion.
+  const auto elapsed_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(finish_time -
+                                                            g_start_time);
+  const double elapsed_seconds = elapsed_ms.count() / 1000.0;
+  printf("KokkosP Variorum Power: Total execution time: %.3f seconds\n",
+         elapsed_seconds);
+
+  const std::string samples_csv =
+      generate_prefix() + "_variorum_power_samples.csv";
+  write_power_data_to_csv(samples_csv);
+
+  if (g_variorum_provider) {
+    g_variorum_provider->finalize();
+  }
+  printf(
+      "======================================================================"
+      "\n");
+
+  timer.finalize_library();
+
+  const std::string file_prefix = generate_prefix();
+
+  const auto& kernel_timings = timer.get_kernel_timings();
+  KokkosTools::Timer::print_kernels_summary(kernel_timings);
+  KokkosTools::Timer::export_kernels_csv(kernel_timings,
+                                         file_prefix + "_kernels.csv");
+
+  const auto& region_timings = timer.get_region_timings();
+  KokkosTools::Timer::print_regions_summary(region_timings);
+  KokkosTools::Timer::export_regions_csv(region_timings,
+                                         file_prefix + "_regions.csv");
+
+  const auto& deep_copy_timings = timer.get_deep_copy_timings();
+  KokkosTools::Timer::print_deepcopies_summary(deep_copy_timings);
+  KokkosTools::Timer::export_deepcopies_csv(deep_copy_timings,
+                                            file_prefix + "_deepcopies.csv");
+}
+
+// Kokkos hook: a parallel_for kernel is starting; forwarded to the timer tool.
+// NOTE(review): the kernel id is passed dereferenced (*kID) here, whereas the
+// scan/reduce hooks pass the pointer itself — confirm against KernelTimerTool.
+void kokkosp_begin_parallel_for(const char* name, const uint32_t devID,
+                                uint64_t* kID) {
+  timer.begin_parallel_for(name, devID, *kID);
+}
+
+// Kokkos hook: the parallel_for kernel identified by kID has ended.
+void kokkosp_end_parallel_for(const uint64_t kID) {
+  timer.end_parallel_for(kID);
+}
+
+// Kokkos hook: a parallel_scan kernel is starting; name, device id and the
+// kernel-id pointer are handed to the timer tool unchanged.
+void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID,
+                                 uint64_t* kID) {
+  timer.begin_parallel_scan(name, devID, kID);
+}
+
+// Kokkos hook: the parallel_scan kernel identified by kID has ended.
+void kokkosp_end_parallel_scan(const uint64_t kID) {
+  timer.end_parallel_scan(kID);
+}
+
+// Kokkos hook: a parallel_reduce kernel is starting; name, device id and the
+// kernel-id pointer are handed to the timer tool unchanged.
+void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID,
+                                   uint64_t* kID) {
+  timer.begin_parallel_reduce(name, devID, kID);
+}
+
+// Kokkos hook: the parallel_reduce kernel identified by kID has ended.
+void kokkosp_end_parallel_reduce(const uint64_t kID) {
+  timer.end_parallel_reduce(kID);
+}
+
+// Kokkos hook: a named profiling region is being entered; forwarded to the
+// timer tool.
+void kokkosp_push_profile_region(char const* regionName) {
+  timer.push_profile_region(regionName);
+}
+
+// Kokkos hook: the current profiling region is being left.
+void kokkosp_pop_profile_region() {
+  timer.pop_profile_region();
+}
+
+// Kokkos hook: a deep copy of `size` bytes is starting. Destination and
+// source descriptions are passed to the timer tool unchanged.
+void kokkosp_begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                             const char* dst_name, const void* dst_ptr,
+                             Kokkos::Tools::SpaceHandle src_handle,
+                             const char* src_name, const void* src_ptr,
+                             uint64_t size) {
+  timer.begin_deep_copy(dst_handle, dst_name, dst_ptr, src_handle, src_name,
+                        src_ptr, size);
+}
+
+// Kokkos hook: the deep copy in progress has finished.
+void kokkosp_end_deep_copy() {
+  timer.end_deep_copy();
+}
+
+// --- Event Set Configuration ---
+
+// Builds the event set that registers this tool's hooks with Kokkos.
+// Callbacks not assigned below are left null.
+Kokkos::Tools::Experimental::EventSet get_event_set() {
+  // Value-initialization zeroes any pointers not set here without relying on
+  // memset (and on <cstring>, which this file does not include). This
+  // assumes EventSet is a plain aggregate of function pointers, as the
+  // previous memset-based code already required.
+  Kokkos::Tools::Experimental::EventSet my_event_set{};
+  my_event_set.init                  = kokkosp_init_library;
+  my_event_set.finalize              = kokkosp_finalize_library;
+  my_event_set.begin_deep_copy       = kokkosp_begin_deep_copy;
+  my_event_set.end_deep_copy         = kokkosp_end_deep_copy;
+  my_event_set.begin_parallel_for    = kokkosp_begin_parallel_for;
+  my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce;
+  my_event_set.begin_parallel_scan   = kokkosp_begin_parallel_scan;
+  my_event_set.end_parallel_for      = kokkosp_end_parallel_for;
+  my_event_set.end_parallel_reduce   = kokkosp_end_parallel_reduce;
+  my_event_set.end_parallel_scan     = kokkosp_end_parallel_scan;
+  my_event_set.push_region           = kokkosp_push_profile_region;
+  my_event_set.pop_region            = kokkosp_pop_profile_region;
+  return my_event_set;
+}
+
+}  // namespace VariorumPower
+}  // namespace KokkosTools
+
+// C-linkage section: each EXPOSE_* macro (not defined in this file —
+// presumably provided by kp_core.hpp; confirm) is handed the VariorumPower
+// implementation of the corresponding profiling hook.
+extern "C" {
+
+// Short alias so the macro arguments below stay readable.
+namespace impl = KokkosTools::VariorumPower;
+
+EXPOSE_INIT(impl::kokkosp_init_library)
+EXPOSE_FINALIZE(impl::kokkosp_finalize_library)
+EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for)
+EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for)
+EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
+EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
+EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
+EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
+EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region)
+EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
+EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy)
+EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy)
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/provider/provider_nvml.cpp b/profiling/energy-profiler/provider/provider_nvml.cpp
new file mode 100644
index 000000000..d5f35e6cd
--- /dev/null
+++ b/profiling/energy-profiler/provider/provider_nvml.cpp
@@ -0,0 +1,219 @@
+#include "provider_nvml.hpp"
+#include <nvml.h>
+#include <iostream>
+#include <cstring>
+
+// Creates a provider in the uninitialized state; NVML itself is only touched
+// once initialize() is called.
+NVMLProvider::NVMLProvider() : initialized_(false) {}
+
+// Releases NVML on destruction when the provider is still initialized.
+NVMLProvider::~NVMLProvider() {
+  if (!initialized_) {
+    return;
+  }
+  finalize();
+}
+
+// Initializes NVML and enumerates the devices.
+// Returns true on success or when already initialized. On failure an error
+// is printed, NVML is shut down again if it had been started, and false is
+// returned.
+bool NVMLProvider::initialize() {
+  if (initialized_) {
+    return true;
+  }
+
+  const nvmlReturn_t init_status = nvmlInit();
+  if (init_status != NVML_SUCCESS) {
+    std::cerr << "NVML Provider: Failed to initialize NVML: "
+              << nvmlErrorString(init_status) << std::endl;
+    return false;
+  }
+
+  // Undo nvmlInit() when device discovery fails.
+  const bool devices_found = discover_devices();
+  if (!devices_found) {
+    nvmlShutdown();
+    return false;
+  }
+
+  initialized_ = true;
+  std::cout << "NVML Provider: Successfully initialized with "
+            << devices_.size() << " device(s)" << std::endl;
+  return true;
+}
+
+// Drops the cached device data and shuts NVML down. Does nothing when the
+// provider is not initialized, so repeated calls are harmless.
+void NVMLProvider::finalize() {
+  if (initialized_) {
+    cleanup_devices();
+    nvmlShutdown();
+    initialized_ = false;
+
+    std::cout << "NVML Provider: Finalized" << std::endl;
+  }
+}
+
+// Sums the current power draw (watts) of all discovered devices.
+// Devices whose reading fails (negative value) are left out of the sum;
+// returns 0.0 when the provider is not initialized.
+double NVMLProvider::get_total_power_usage() {
+  double watts_sum = 0.0;
+
+  if (initialized_) {
+    const size_t gpu_total = devices_.size();
+    for (size_t idx = 0; idx < gpu_total; ++idx) {
+      const double watts = get_device_power_usage(idx);
+      if (watts >= 0.0) {
+        watts_sum += watts;
+      }
+    }
+  }
+
+  return watts_sum;
+}
+
+// Reads the current power draw of one device via nvmlDeviceGetPowerUsage().
+// Returns the value in watts, or -1.0 when the provider is not initialized,
+// the index is out of range, the device handle is null, or NVML reports an
+// error (which is also logged to stderr).
+double NVMLProvider::get_device_power_usage(size_t device_index) {
+  const bool usable = initialized_ && device_index < devices_.size() &&
+                      devices_[device_index] != nullptr;
+  if (!usable) {
+    return -1.0;
+  }
+
+  unsigned int milliwatts = 0;
+  const nvmlReturn_t status =
+      nvmlDeviceGetPowerUsage(devices_[device_index], &milliwatts);
+  if (status != NVML_SUCCESS) {
+    std::cerr << "NVML Provider: Failed to get power usage for device "
+              << device_index << ": " << nvmlErrorString(status) << std::endl;
+    return -1.0;
+  }
+
+  // Convert from milliwatts to watts
+  return static_cast<double>(milliwatts) / 1000.0;
+}
+
+// Reads the instantaneous power of one device through the NVML field-value
+// API (NVML_FI_DEV_POWER_INSTANT).
+// Returns the value in watts, or -1.0 when the provider is not initialized,
+// the index is out of range, the device handle is null, or the query fails.
+double NVMLProvider::get_device_power_usage_direct(size_t device_index) {
+  if (!initialized_ || device_index >= devices_.size()) {
+    return -1.0;
+  }
+
+  if (devices_[device_index] == nullptr) {
+    return -1.0;
+  }
+
+  // Zero the whole request so scopeId and the other input fields are not
+  // left indeterminate (scopeId 0 is presumably the GPU power scope —
+  // confirm against nvml.h).
+  nvmlFieldValue_t powerFieldNow{};
+  powerFieldNow.fieldId = NVML_FI_DEV_POWER_INSTANT;
+  const nvmlReturn_t call_status =
+      nvmlDeviceGetFieldValues(devices_[device_index], 1, &powerFieldNow);
+
+  // The call result and the per-field status are separate; the value is only
+  // meaningful when both report success.
+  if (call_status != NVML_SUCCESS || powerFieldNow.nvmlReturn != NVML_SUCCESS) {
+    std::cerr << "NVML power read failed — stopping measurement.\n";
+    return -1.0;
+  }
+  unsigned int pw = static_cast<unsigned int>(powerFieldNow.value.uiVal);
+  // Convert from milliwatts to watts
+  return static_cast<double>(pw) / 1000.0;
+}
+
+// Reads the device's cumulative energy counter via
+// nvmlDeviceGetTotalEnergyConsumption().
+// Returns the value in joules, or -1.0 when the provider is not initialized,
+// the index is out of range, the device handle is null, or NVML reports an
+// error (which is also logged to stderr).
+double NVMLProvider::get_current_energy_consumption(size_t device_index) {
+  const bool usable = initialized_ && device_index < devices_.size() &&
+                      devices_[device_index] != nullptr;
+  if (!usable) {
+    return -1.0;
+  }
+
+  // The raw counter is treated as millijoules and scaled to joules below.
+  unsigned long long energy_millijoules = 0;
+  const nvmlReturn_t status = nvmlDeviceGetTotalEnergyConsumption(
+      devices_[device_index], &energy_millijoules);
+  if (status != NVML_SUCCESS) {
+    std::cerr << "NVML Provider: Failed to get energy consumption for device "
+              << device_index << ": " << nvmlErrorString(status) << std::endl;
+    return -1.0;
+  }
+
+  return static_cast<double>(energy_millijoules) / 1000.0;
+}
+
+// Number of device slots recorded by discover_devices().
+size_t NVMLProvider::get_device_count() const {
+  return devices_.size();
+}
+
+// Returns the stored name of the device at device_index, or
+// "Unknown Device" when the index is out of range.
+std::string NVMLProvider::get_device_name(size_t device_index) const {
+  const bool in_range = device_index < device_names_.size();
+  return in_range ? device_names_[device_index]
+                  : std::string("Unknown Device");
+}
+
+// Enumerates the NVIDIA devices and fills devices_ / device_names_.
+//
+// Returns false when the device count cannot be read or is zero. A device
+// whose handle cannot be obtained keeps a null handle and the name
+// "Failed Device"; discovery continues with the next device. For each usable
+// device the name, power-management mode and a test power reading are
+// printed for diagnostics only.
+bool NVMLProvider::discover_devices() {
+  unsigned int gpu_total = 0;
+  nvmlReturn_t status    = nvmlDeviceGetCount(&gpu_total);
+
+  if (status != NVML_SUCCESS) {
+    std::cerr << "NVML Provider: Failed to get device count: "
+              << nvmlErrorString(status) << std::endl;
+    return false;
+  }
+
+  if (gpu_total == 0) {
+    std::cerr << "NVML Provider: No NVIDIA devices found" << std::endl;
+    return false;
+  }
+
+  devices_.resize(gpu_total);
+  device_names_.resize(gpu_total);
+
+  std::cout << "NVML Provider: Found " << gpu_total << " NVIDIA device(s)"
+            << std::endl;
+
+  for (unsigned int idx = 0; idx < gpu_total; ++idx) {
+    nvmlDevice_t& handle = devices_[idx];
+    std::string& label   = device_names_[idx];
+
+    status = nvmlDeviceGetHandleByIndex(idx, &handle);
+    if (status != NVML_SUCCESS) {
+      std::cerr << "NVML Provider: Failed to get handle for device " << idx
+                << std::endl;
+      // A null handle makes the power/energy getters return -1.0 for it.
+      handle = nullptr;
+      label  = "Failed Device";
+      continue;
+    }
+
+    // Device name, with a numbered placeholder when NVML cannot supply it.
+    char name_buffer[NVML_DEVICE_NAME_BUFFER_SIZE];
+    status =
+        nvmlDeviceGetName(handle, name_buffer, NVML_DEVICE_NAME_BUFFER_SIZE);
+    if (status == NVML_SUCCESS) {
+      label = std::string(name_buffer);
+      std::cout << "NVML Provider: Device " << idx << ": " << name_buffer
+                << std::endl;
+    } else {
+      label = "Unknown Device " + std::to_string(idx);
+    }
+
+    // Power-management capability (informational only).
+    nvmlEnableState_t pm_mode;
+    status = nvmlDeviceGetPowerManagementMode(handle, &pm_mode);
+    const bool pm_enabled =
+        (status == NVML_SUCCESS) && (pm_mode == NVML_FEATURE_ENABLED);
+    if (pm_enabled) {
+      std::cout << "NVML Provider: Device " << idx
+                << ": Power management enabled" << std::endl;
+    } else {
+      std::cout << "NVML Provider: Device " << idx
+                << ": Power management disabled or not supported" << std::endl;
+    }
+
+    // Trial power reading (informational only).
+    unsigned int trial_mW = 0;
+    status                = nvmlDeviceGetPowerUsage(handle, &trial_mW);
+    if (status == NVML_SUCCESS) {
+      std::cout << "NVML Provider: Device " << idx
+                << ": Current power usage: " << (trial_mW / 1000.0) << " W"
+                << std::endl;
+    } else {
+      std::cout << "NVML Provider: Device " << idx
+                << ": Power usage reading failed: " << nvmlErrorString(status)
+                << std::endl;
+    }
+  }
+
+  return true;
+}
+
+// Forgets all discovered device handles and names. Only the two vectors are
+// emptied here; no NVML call is made.
+void NVMLProvider::cleanup_devices() {
+  devices_.clear();
+  device_names_.clear();
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/provider/provider_nvml.hpp b/profiling/energy-profiler/provider/provider_nvml.hpp
new file mode 100644
index 000000000..488f6e68e
--- /dev/null
+++ b/profiling/energy-profiler/provider/provider_nvml.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <vector>
+#include <string>
+#include <nvml.h>
+
+/**
+ * NVML Power Provider
+ * Simplified power monitoring using nvmlDeviceGetPowerUsage()
+ *
+ * Typical use: construct, call initialize(), query power/energy, then call
+ * finalize() (the destructor also finalizes when still initialized).
+ *
+ * NOTE(review): the class declares a destructor but no copy/move operations,
+ * so it is implicitly copyable — confirm whether copies should be disallowed.
+ */
+class NVMLProvider {
+ public:
+  // Creates an uninitialized provider.
+  NVMLProvider();
+  // Finalizes the provider if it is still initialized.
+  ~NVMLProvider();
+
+  // Initialize NVML and discover devices
+  // Returns true on success (or if already initialized), false otherwise.
+  bool initialize();
+
+  // Cleanup NVML resources
+  void finalize();
+
+  // Get current power consumption in Watts for all devices
+  // Returns 0.0 when the provider is not initialized.
+  double get_total_power_usage();
+
+  // Get power usage for a specific device
+  // Returns -1.0 on any failure (bad index, missing handle, NVML error).
+  double get_device_power_usage(size_t device_index);  // unit: Watts
+
+  // Same quantity read through the NVML field-value API; -1.0 on failure.
+  double get_device_power_usage_direct(size_t device_index);  // unit: Watts
+
+  // Cumulative energy counter of one device; -1.0 on failure.
+  double get_current_energy_consumption(size_t device_index);  // unit: Joules
+
+  // Get number of available devices
+  size_t get_device_count() const;
+
+  // Get device name
+  // Returns "Unknown Device" when device_index is out of range.
+  std::string get_device_name(size_t device_index) const;
+
+  // Check if provider is initialized
+  bool is_initialized() const { return initialized_; }
+
+ private:
+  // True between a successful initialize() and the next finalize().
+  bool initialized_;
+  // Parallel vectors indexed by device index; a handle may be null when it
+  // could not be obtained during discovery.
+  std::vector<nvmlDevice_t> devices_;
+  std::vector<std::string> device_names_;
+
+  // Helper methods
+  bool discover_devices();
+  void cleanup_devices();
+};
\ No newline at end of file
diff --git a/profiling/energy-profiler/provider/provider_variorum.cpp b/profiling/energy-profiler/provider/provider_variorum.cpp
new file mode 100644
index 000000000..a31f1eed2
--- /dev/null
+++ b/profiling/energy-profiler/provider/provider_variorum.cpp
@@ -0,0 +1,230 @@
+#include "provider_variorum.hpp"
+#include <iostream>
+#include <set>
+#include <cstring>
+
+VariorumProvider::VariorumProvider() : initialized_{false} {}
+
+VariorumProvider::~VariorumProvider() {
+  // Tear down only a provider that is still marked as initialized.
+  if (!initialized_) return;
+  finalize();
+}
+
+bool VariorumProvider::initialize() {
+  // A second call on a live provider is a successful no-op.
+  if (initialized_) return true;
+
+  // Variorum itself needs no explicit start-up call here (it was a no-op in
+  // the original code); the library is assumed to be available and working.
+
+  // If device discovery fails, the provider stays uninitialized.
+  const bool devices_found = discover_devices();
+  if (!devices_found) return false;
+
+  initialized_ = true;
+
+  // Announce how many devices were registered.
+  std::cout << "Variorum Provider: Successfully initialized with "
+            << device_ids_.size() << " device(s)" << std::endl;
+
+  return true;
+}
+
+void VariorumProvider::finalize() {
+  // Only a live provider has state to release; otherwise stay silent.
+  if (initialized_) {
+    cleanup_devices();
+    initialized_ = false;
+
+    // Report the shutdown once the bookkeeping has been reset.
+    std::cout << "Variorum Provider: Finalized" << std::endl;
+  }
+}
+
+double VariorumProvider::get_total_power_usage() {
+  // An uninitialized provider has nothing to measure.
+  if (!initialized_) return 0.0;
+
+  // One Variorum query yields the readings of every known device.
+  const std::map<uint32_t, double> readings = get_current_power_readings();
+
+  // Add up the per-device values, leaving out negative ones.
+  double sum_watts = 0.0;
+  for (const auto& entry : readings) {
+    const double watts = entry.second;
+    if (watts >= 0.0) sum_watts += watts;
+  }
+
+  return sum_watts;
+}
+
+double VariorumProvider::get_device_power_usage(size_t device_index) {
+  // -1.0 signals "no reading": provider down, bad index, or GPU absent
+  // from the latest Variorum sample.
+  constexpr double kNoReading = -1.0;
+  if (!initialized_) return kNoReading;
+  if (device_index >= device_ids_.size()) return kNoReading;
+
+  // Translate the dense index into the GPU id used as the map key.
+  const uint32_t gpu_id = device_ids_[device_index];
+
+  const std::map<uint32_t, double> readings = get_current_power_readings();
+  const auto match = readings.find(gpu_id);
+
+  return (match != readings.end()) ? match->second : kNoReading;
+}
+
+// Number of GPUs currently registered in device_ids_.
+size_t VariorumProvider::get_device_count() const { return device_ids_.size(); }
+
+std::string VariorumProvider::get_device_name(size_t device_index) const {
+  // An out-of-range index yields a placeholder name rather than an error.
+  const bool known = device_index < device_names_.size();
+  return known ? device_names_[device_index] : std::string("Unknown Device");
+}
+
+bool VariorumProvider::discover_devices() {  // true when >= 1 GPU is found
+  std::set<uint32_t> found_device_ids;
+  unique_json_ptr root = get_variorum_json_data();
+
+  if (!root) {
+    std::cerr << "Variorum Provider: Failed to get JSON data from Variorum"
+              << std::endl;
+    return false;
+  }
+
+  // Treat the first top-level JSON entry as the host object (TODO confirm)
+  json_t* host_obj = json_object_iter_value(json_object_iter(root.get()));
+  if (!host_obj) {
+    std::cerr << "Variorum Provider: No host object found in JSON" << std::endl;
+    return false;
+  }
+
+  json_t* socket_0 = json_object_get(host_obj, "socket_0");  // only socket_0
+  if (socket_0 && json_is_object(socket_0)) {
+    json_t* power_gpu_watts = json_object_get(socket_0, "power_gpu_watts");
+    if (power_gpu_watts && json_is_object(power_gpu_watts)) {
+      const char* key;
+      json_t* value;
+      json_object_foreach(power_gpu_watts, key, value) {
+        std::string s_key(key);
+        if (s_key.length() > 4 && s_key.substr(0, 4) == "GPU_") {
+          try {  // std::stoul throws when the suffix is not a number
+            uint32_t device_id = std::stoul(s_key.substr(4));
+            found_device_ids.insert(device_id);
+          } catch (const std::exception& e) {
+            std::cerr << "Variorum Provider: Could not parse GPU ID from key: "
+                      << s_key << " (" << e.what() << ")" << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+  if (found_device_ids.empty()) {
+    std::cerr << "Variorum Provider: No GPU devices found" << std::endl;
+    return false;
+  }
+
+  // Replace any earlier list; std::set iteration yields ids in ascending order
+  device_ids_.clear();
+  device_names_.clear();
+
+  for (uint32_t device_id : found_device_ids) {
+    device_ids_.push_back(device_id);
+    device_names_.push_back("GPU_" + std::to_string(device_id));
+
+    std::cout << "Variorum Provider: Found device " << device_ids_.size() - 1
+              << ": GPU_" << device_id << std::endl;
+  }
+
+  // Probe one reading per device for logging only; a miss is not fatal
+  std::cout << "Variorum Provider: Testing initial power readings..."
+            << std::endl;
+  std::map<uint32_t, double> test_readings = get_current_power_readings();
+  for (size_t i = 0; i < device_ids_.size(); ++i) {
+    uint32_t device_id = device_ids_[i];
+    auto it            = test_readings.find(device_id);
+    if (it != test_readings.end()) {
+      std::cout << "Variorum Provider: Device " << i
+                << ": Current power usage: " << it->second << " W" << std::endl;
+    } else {
+      std::cout << "Variorum Provider: Device " << i << ": Power reading failed"
+                << std::endl;
+    }
+  }
+
+  return true;
+}
+
+void VariorumProvider::cleanup_devices() {  // forget all discovered GPUs
+  device_names_.clear();
+  device_ids_.clear();
+}
+
+VariorumProvider::unique_json_ptr VariorumProvider::get_variorum_json_data()
+    const {
+  // Queries Variorum once and returns the parsed JSON tree, or null on error.
+  char* json_string_c_raw = nullptr;
+  const int variorum_error = variorum_get_power_json(&json_string_c_raw);
+  // Own the buffer before any early return so a failed call cannot leak it.
+  unique_cstring json_string_c(json_string_c_raw);
+
+  if (variorum_error != 0) {
+    std::cerr
+        << "Variorum Provider: variorum_get_power_json() failed. Error code: "
+        << variorum_error << std::endl;
+    return unique_json_ptr(nullptr);
+  }
+
+  if (!json_string_c) {
+    std::cerr
+        << "Variorum Provider: variorum_get_power_json() returned success "
+           "but a null pointer."
+        << std::endl;
+    return unique_json_ptr(nullptr);
+  }
+
+  json_error_t error;
+  json_t* root_ptr = json_loads(json_string_c.get(), 0, &error);
+  if (!root_ptr) {
+    std::cerr << "Variorum Provider: Failed to parse JSON: " << error.text
+              << std::endl;
+    return unique_json_ptr(nullptr);
+  }
+
+  return unique_json_ptr(root_ptr);
+}
+
+std::map<uint32_t, double> VariorumProvider::get_current_power_readings()
+    const {
+  // Maps GPU id -> watts; devices without a numeric entry are simply absent.
+  std::map<uint32_t, double> result;
+
+  const unique_json_ptr root = get_variorum_json_data();
+  if (!root) return result;
+
+  // The first top-level entry is treated as the host object.
+  json_t* const host = json_object_iter_value(json_object_iter(root.get()));
+  if (!host) return result;
+
+  // Only the "socket_0" section is consulted for GPU power values.
+  json_t* const socket = json_object_get(host, "socket_0");
+  if (!socket || !json_is_object(socket)) return result;
+
+  json_t* const gpu_watts = json_object_get(socket, "power_gpu_watts");
+  if (!gpu_watts || !json_is_object(gpu_watts)) return result;
+
+  for (const uint32_t gpu_id : device_ids_) {
+    const std::string key = "GPU_" + std::to_string(gpu_id);
+    json_t* const watts_node = json_object_get(gpu_watts, key.c_str());
+
+    // json_is_number() is false for a missing (null) node as well.
+    if (json_is_number(watts_node)) {
+      result[gpu_id] = json_number_value(watts_node);
+    }
+  }
+
+  return result;
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/provider/provider_variorum.hpp b/profiling/energy-profiler/provider/provider_variorum.hpp
new file mode 100644
index 000000000..bdf470a07
--- /dev/null
+++ b/profiling/energy-profiler/provider/provider_variorum.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <map>
+#include <memory>
+
+extern "C" {
+#include <variorum.h>
+#include <jansson.h>
+}
+
+class VariorumProvider {  // GPU power readings parsed from Variorum's JSON
+ public:
+  VariorumProvider();
+  ~VariorumProvider();
+
+  // Lifecycle: initialize() discovers GPUs, finalize() clears them
+  bool initialize();
+  void finalize();
+  bool is_initialized() const { return initialized_; }
+
+  // Power monitoring (values come from the "power_gpu_watts" JSON section)
+  double get_total_power_usage();  // 0.0 when not initialized
+  double get_device_power_usage(size_t device_index);  // -1.0 if unavailable
+
+  // Device information (device_index is a position in the discovered list)
+  size_t get_device_count() const;
+  std::string get_device_name(size_t device_index) const;  // e.g. "GPU_0"
+
+ private:
+  struct JsonDeleter {  // drops one jansson reference via json_decref()
+    void operator()(json_t* json) const {
+      if (json) json_decref(json);
+    }
+  };
+  using unique_json_ptr = std::unique_ptr<json_t, JsonDeleter>;
+
+  struct CFreeDeleter {  // releases a C string with free()
+    void operator()(char* ptr) const {
+      if (ptr) free(ptr);
+    }
+  };
+  using unique_cstring = std::unique_ptr<char, CFreeDeleter>;
+
+  // Internal methods (each JSON fetch issues a new variorum_get_power_json())
+  bool discover_devices();
+  void cleanup_devices();
+  unique_json_ptr get_variorum_json_data() const;
+  std::map<uint32_t, double> get_current_power_readings() const;
+
+  // Member variables
+  bool initialized_;                       // set by initialize()/finalize()
+  std::vector<uint32_t> device_ids_;       // N parsed from each "GPU_N" key
+  std::vector<std::string> device_names_;  // "GPU_N", parallel to device_ids_
+};
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/CMakeLists.txt b/profiling/energy-profiler/tests/CMakeLists.txt
new file mode 100644
index 000000000..da391e9e9
--- /dev/null
+++ b/profiling/energy-profiler/tests/CMakeLists.txt
@@ -0,0 +1,152 @@
+# Tests for Energy Profiler (each add_test() below registers one CTest case)
+
+enable_testing()  # NOTE(review): confirm the top-level CMakeLists enables it too
+
+# Find Threads package for pthread support (needed by std::thread in daemon.cpp)
+find_package(Threads REQUIRED)
+
+# Daemon test: built unconditionally, needs only the Threads package
+add_executable(daemon_test
+    daemon_test.cpp
+    ../common/daemon.cpp
+)
+
+target_include_directories(daemon_test PRIVATE
+    ../common
+)
+
+target_link_libraries(daemon_test PRIVATE Threads::Threads)
+
+add_test(NAME daemon_test COMMAND daemon_test)
+
+# NVML tests: built only when CUDAToolkit is found and exposes CUDA::nvml
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+    # CUDA::nvml is an imported target provided by find_package(CUDAToolkit).
+    if(TARGET CUDA::nvml)
+        message(STATUS "Found CUDA NVML, making NVML tests available.")
+
+        # NVML Provider test
+        add_executable(nvml_provider_test
+            nvml_provider_test.cpp
+            ../provider/provider_nvml.cpp
+        )
+
+        target_include_directories(nvml_provider_test PRIVATE
+            ../provider
+        )
+
+        target_link_libraries(nvml_provider_test PRIVATE CUDA::nvml)
+        add_test(NAME nvml_provider_test COMMAND nvml_provider_test)
+
+        # Daemon NVML Integration test
+        add_executable(daemon_nvml_integration_test
+            daemon_nvml_integration_test.cpp
+            ../common/daemon.cpp
+            ../provider/provider_nvml.cpp
+        )
+
+        target_include_directories(daemon_nvml_integration_test PRIVATE
+            ../common
+            ../provider
+        )
+
+        target_link_libraries(daemon_nvml_integration_test PRIVATE CUDA::nvml Threads::Threads)
+        add_test(NAME daemon_nvml_integration_test COMMAND daemon_nvml_integration_test)
+
+        # Fast Daemon NVML Integration test (20ms sampling)
+        add_executable(daemon_nvml_fast_test
+            daemon_nvml_fast_test.cpp
+            ../common/daemon.cpp
+            ../provider/provider_nvml.cpp
+        )
+
+        target_include_directories(daemon_nvml_fast_test PRIVATE
+            ../common
+            ../provider
+        )
+
+        target_link_libraries(daemon_nvml_fast_test PRIVATE CUDA::nvml Threads::Threads)
+        add_test(NAME daemon_nvml_fast_test COMMAND daemon_nvml_fast_test)
+    else()
+        message(STATUS "CUDA::nvml target not found. NVML tests will be skipped.")
+    endif()
+else()
+    message(STATUS "CUDAToolkit not found, NVML tests will be skipped.")
+endif()
+
+# Variorum tests: built only when KOKKOSTOOLS_HAS_VARIORUM is set
+if(KOKKOSTOOLS_HAS_VARIORUM)
+    message(STATUS "Using Variorum configuration from root CMake, making Variorum tests available.")
+    
+    add_executable(variorum_provider_test
+        variorum_provider_test.cpp
+        ../provider/provider_variorum.cpp
+    )
+
+    target_include_directories(variorum_provider_test PRIVATE
+        ../provider
+    )
+
+    target_link_libraries(variorum_provider_test PRIVATE variorum::variorum)
+    add_test(NAME variorum_provider_test COMMAND variorum_provider_test)
+
+    # Daemon Variorum Integration test (daemon thread + Variorum provider)
+    add_executable(daemon_variorum_integration_test
+        daemon_variorum_integration_test.cpp
+        ../common/daemon.cpp
+        ../provider/provider_variorum.cpp
+    )
+
+    target_include_directories(daemon_variorum_integration_test PRIVATE
+        ../common
+        ../provider
+    )
+
+    target_link_libraries(daemon_variorum_integration_test PRIVATE variorum::variorum Threads::Threads)
+    add_test(NAME daemon_variorum_integration_test COMMAND daemon_variorum_integration_test)
+
+    # Fast Daemon Variorum Integration test (same sources, separate test driver)
+    add_executable(daemon_variorum_fast_test
+        daemon_variorum_fast_test.cpp
+        ../common/daemon.cpp
+        ../provider/provider_variorum.cpp
+    )
+
+    target_include_directories(daemon_variorum_fast_test PRIVATE
+        ../common
+        ../provider
+    )
+
+    target_link_libraries(daemon_variorum_fast_test PRIVATE variorum::variorum Threads::Threads)
+    add_test(NAME daemon_variorum_fast_test COMMAND daemon_variorum_fast_test)
+else()
+    message(STATUS "Variorum not available from root CMake, Variorum tests will be skipped.")
+endif()
+
+add_executable(timer_test  # timer test: built unconditionally, links no libraries
+    timer_test.cpp
+    ../common/timer.cpp
+)
+
+target_include_directories(timer_test PRIVATE
+    ../common
+    ../tools
+)
+
+add_test(NAME timer_test COMMAND timer_test)
+
+# CSV export test: compiles the timer, filename-prefix and kernel-timer sources
+add_executable(csv_export_test
+    csv_export_test.cpp
+    ../common/timer.cpp
+    ../common/filename_prefix.cpp
+    ../tools/kernel_timer_tool.cpp
+)
+
+target_include_directories(csv_export_test PRIVATE
+    ../common
+    ../tools
+)
+
+add_test(NAME csv_export_test COMMAND csv_export_test)
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/csv_export_test.cpp b/profiling/energy-profiler/tests/csv_export_test.cpp
new file mode 100644
index 000000000..366f33795
--- /dev/null
+++ b/profiling/energy-profiler/tests/csv_export_test.cpp
@@ -0,0 +1,68 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <iostream>
+#include <deque>
+#include <chrono>
+#include <thread>
+#include "../common/timer.hpp"
+#include "../tools/kernel_timer_tool.hpp"
+
+int main() {
+  using std::chrono::milliseconds;
+  std::cout << "Testing CSV export functions..." << std::endl;
+
+  // Record one timed interval per region type; the sleeps stand in for work.
+  KernelTimerTool tool;
+  tool.start_region("test_kernel_1", RegionType::ParallelFor, 1);
+  std::this_thread::sleep_for(milliseconds(10));
+  tool.end_region();
+
+  tool.start_region("test_kernel_2", RegionType::ParallelReduce, 2);
+  std::this_thread::sleep_for(milliseconds(5));
+  tool.end_region();
+
+  tool.start_region("test_region", RegionType::UserRegion, 3);
+  std::this_thread::sleep_for(milliseconds(15));
+  tool.end_region();
+
+  tool.start_region("test_deepcopy", RegionType::DeepCopy, 4);
+  std::this_thread::sleep_for(milliseconds(8));
+  tool.end_region();
+
+  // Fetch the three timing collections gathered by the tool.
+  const auto& kernel_rows = tool.get_kernel_timings();
+  const auto& region_rows = tool.get_region_timings();
+  const auto& copy_rows   = tool.get_deep_copy_timings();
+
+  std::cout << "Found " << kernel_rows.size() << " kernels" << std::endl;
+  std::cout << "Found " << region_rows.size() << " regions" << std::endl;
+  std::cout << "Found " << copy_rows.size() << " deep copies" << std::endl;
+
+  // Write each collection to its own CSV file.
+  KokkosTools::Timer::export_kernels_csv(kernel_rows, "test_kernels.csv");
+  KokkosTools::Timer::export_regions_csv(region_rows, "test_regions.csv");
+  KokkosTools::Timer::export_deepcopies_csv(copy_rows, "test_deepcopies.csv");
+
+  // Print the matching summaries.
+  KokkosTools::Timer::print_kernels_summary(kernel_rows);
+  KokkosTools::Timer::print_regions_summary(region_rows);
+  KokkosTools::Timer::print_deepcopies_summary(copy_rows);
+
+  std::cout << "CSV export test completed successfully!" << std::endl;
+
+  return 0;
+}
diff --git a/profiling/energy-profiler/tests/daemon_nvml_fast_test.cpp b/profiling/energy-profiler/tests/daemon_nvml_fast_test.cpp
new file mode 100644
index 000000000..20a62b864
--- /dev/null
+++ b/profiling/energy-profiler/tests/daemon_nvml_fast_test.cpp
@@ -0,0 +1,226 @@
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <mutex>
+#include <numeric>
+#include <thread>
+#include <vector>
+#include "../common/daemon.hpp"
+#include "../provider/provider_nvml.hpp"
+// State shared between the daemon callback and the test body
+static NVMLProvider* g_nvml_provider = nullptr;  // set and cleared by the test
+static std::atomic<uint32_t> g_sample_count{0};  // samples taken so far
+static std::atomic<double> g_total_energy{0.0};  // sum of power * 0.02 s
+static std::atomic<double> g_min_power{std::numeric_limits<double>::max()};
+static std::atomic<double> g_max_power{0.0};
+static std::vector<double> g_power_samples;  // guarded by g_samples_mutex
+static std::mutex g_samples_mutex;
+
+void fast_power_monitoring_function() {
+  // Without a live provider there is nothing to sample.
+  const bool provider_ready =
+      g_nvml_provider != nullptr && g_nvml_provider->is_initialized();
+  if (!provider_ready) return;
+
+  const double watts = g_nvml_provider->get_total_power_usage();
+
+  // Count the sample before folding it into the aggregates.
+  ++g_sample_count;
+
+  // Energy += power * dt with dt fixed at the nominal 20 ms period (0.02 s).
+  // atomic<double> is updated through a compare-exchange retry loop; a failed
+  // exchange refreshes `energy_seen`, so the sum is recomputed on each retry.
+  double energy_seen = g_total_energy.load();
+  while (!g_total_energy.compare_exchange_weak(energy_seen,
+                                               energy_seen + watts * 0.02)) {
+  }
+
+  // Lower the recorded minimum while this sample is still below it.
+  double min_seen = g_min_power.load();
+  while (watts < min_seen &&
+         !g_min_power.compare_exchange_weak(min_seen, watts)) {
+  }
+
+  // Raise the recorded maximum while this sample is still above it.
+  double max_seen = g_max_power.load();
+  while (watts > max_seen &&
+         !g_max_power.compare_exchange_weak(max_seen, watts)) {
+  }
+
+  // Keep the raw sample for the post-run statistics; the vector is shared
+  // with the test thread, hence the mutex.
+  {
+    std::lock_guard<std::mutex> guard(g_samples_mutex);
+    g_power_samples.push_back(watts);
+  }
+}
+
+double calculate_standard_deviation(const std::vector<double>& samples,
+                                    double mean) {
+  // Sample standard deviation (n - 1 divisor); 0.0 for fewer than two values.
+  const std::size_t count = samples.size();
+  if (count < 2) return 0.0;
+
+  double squared_error = 0.0;
+  for (std::size_t i = 0; i < count; ++i) {
+    squared_error += (samples[i] - mean) * (samples[i] - mean);
+  }
+  return std::sqrt(squared_error / (count - 1));
+}
+
+bool test_daemon_nvml_fast_integration() {
+  std::cout << "=== Fast Daemon + NVML Integration Test (20ms sampling) ==="
+            << std::endl;
+
+  // Reset global counters
+  g_sample_count = 0;
+  g_total_energy = 0.0;
+  g_min_power    = std::numeric_limits<double>::max();
+  g_max_power    = 0.0;
+  g_power_samples.clear();
+
+  // Initialize NVML provider
+  std::cout << "\n1. Initializing NVML provider..." << std::endl;
+  NVMLProvider nvml_provider;
+  if (!nvml_provider.initialize()) {
+    std::cout << "ERROR: Failed to initialize NVML provider" << std::endl;
+    return false;
+  }
+
+  g_nvml_provider = &nvml_provider;
+  std::cout << "SUCCESS: NVML provider initialized with "
+            << nvml_provider.get_device_count() << " device(s)" << std::endl;
+
+  // Create daemon with 20ms interval
+  std::cout << "\n2. Creating daemon with 20ms monitoring interval..."
+            << std::endl;
+  Daemon power_daemon(fast_power_monitoring_function, 20);
+
+  // Start monitoring
+  std::cout << "\n3. Starting fast power monitoring..." << std::endl;
+  power_daemon.start();
+  std::cout << "SUCCESS: Fast power monitoring started" << std::endl;
+
+  // Let it run for 2 seconds
+  std::cout << "\n4. Monitoring for 2 seconds (high frequency sampling)..."
+            << std::endl;
+  std::cout << "   (No real-time output to avoid saturation)" << std::endl;
+
+  auto start_time = std::chrono::high_resolution_clock::now();
+  std::this_thread::sleep_for(std::chrono::seconds(2));
+  auto end_time = std::chrono::high_resolution_clock::now();
+
+  // Stop monitoring
+  std::cout << "\n5. Stopping power monitoring..." << std::endl;
+  power_daemon.stop();
+  std::cout << "SUCCESS: Power monitoring stopped" << std::endl;
+
+  // Calculate actual monitoring duration
+  auto actual_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+      end_time - start_time);
+
+  // Analyze collected data
+  std::cout << "\n=== Statistical Analysis ===" << std::endl;
+
+  uint32_t total_samples = g_sample_count.load();
+  double total_energy    = g_total_energy.load();
+  double min_power       = g_min_power.load();
+  double max_power       = g_max_power.load();
+
+  std::cout << "Monitoring duration: " << actual_duration.count() << " ms"
+            << std::endl;
+  std::cout << "Total samples collected: " << total_samples << std::endl;
+  std::cout << "Expected samples (50 Hz): " << (actual_duration.count() / 20)
+            << std::endl;
+  std::cout << "Sampling efficiency: " << std::fixed << std::setprecision(1)
+            << (100.0 * total_samples / (actual_duration.count() / 20.0)) << "%"
+            << std::endl;
+
+  if (total_samples > 0) {
+    double avg_power = total_energy / (total_samples * 0.02);
+
+    std::cout << "\n=== Power Statistics ===" << std::endl;
+    std::cout << "Average power: " << std::fixed << std::setprecision(2)
+              << avg_power << " W" << std::endl;
+    std::cout << "Minimum power: " << std::fixed << std::setprecision(2)
+              << min_power << " W" << std::endl;
+    std::cout << "Maximum power: " << std::fixed << std::setprecision(2)
+              << max_power << " W" << std::endl;
+    std::cout << "Power range: " << std::fixed << std::setprecision(2)
+              << (max_power - min_power) << " W" << std::endl;
+    std::cout << "Total energy consumed: " << std::fixed << std::setprecision(3)
+              << total_energy << " J" << std::endl;
+
+    // Calculate additional statistics from stored samples
+    {
+      std::lock_guard<std::mutex> lock(g_samples_mutex);
+      if (!g_power_samples.empty()) {
+        std::sort(g_power_samples.begin(), g_power_samples.end());
+
+        size_t n = g_power_samples.size();
+        double median =
+            (n % 2 == 0)
+                ? (g_power_samples[n / 2 - 1] + g_power_samples[n / 2]) / 2.0
+                : g_power_samples[n / 2];
+
+        double q1 = g_power_samples[n / 4];
+        double q3 = g_power_samples[3 * n / 4];
+
+        double std_dev =
+            calculate_standard_deviation(g_power_samples, avg_power);
+
+        std::cout << "\n=== Extended Statistics ===" << std::endl;
+        std::cout << "Median power: " << std::fixed << std::setprecision(2)
+                  << median << " W" << std::endl;
+        std::cout << "Q1 (25th percentile): " << std::fixed
+                  << std::setprecision(2) << q1 << " W" << std::endl;
+        std::cout << "Q3 (75th percentile): " << std::fixed
+                  << std::setprecision(2) << q3 << " W" << std::endl;
+        std::cout << "Standard deviation: " << std::fixed
+                  << std::setprecision(2) << std_dev << " W" << std::endl;
+        std::cout << "Coefficient of variation: " << std::fixed
+                  << std::setprecision(1) << (100.0 * std_dev / avg_power)
+                  << "%" << std::endl;
+      }
+    }
+
+    // Show per-device breakdown if multiple devices
+    size_t device_count = nvml_provider.get_device_count();
+    if (device_count > 1) {
+      std::cout << "\n=== Per-Device Final Readings ===" << std::endl;
+      for (size_t i = 0; i < device_count; ++i) {
+        double device_power     = nvml_provider.get_device_power_usage(i);
+        std::string device_name = nvml_provider.get_device_name(i);
+        std::cout << "  " << device_name << ": " << std::fixed
+                  << std::setprecision(2) << device_power << " W" << std::endl;
+      }
+    }
+  }
+
+  // Cleanup
+  std::cout << "\n6. Cleaning up..." << std::endl;
+  g_nvml_provider = nullptr;
+  nvml_provider.finalize();
+  std::cout << "SUCCESS: Cleanup completed" << std::endl;
+
+  return total_samples > 0;  // a run that collected no samples is a failure
+}
+
+int main() {
+  // Exit status 0 on success; 1 on a failed check or a std::exception.
+  try {
+    const bool passed = test_daemon_nvml_fast_integration();
+    const char* verdict = passed ? "\nFast integration test PASSED!"
+                                 : "\nFast integration test FAILED!";
+    std::cout << verdict << std::endl;
+    return passed ? 0 : 1;
+  } catch (const std::exception& e) {
+    std::cerr << "\nTest failed with exception: " << e.what() << std::endl;
+  }
+
+  return 1;
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/daemon_nvml_integration_test.cpp b/profiling/energy-profiler/tests/daemon_nvml_integration_test.cpp
new file mode 100644
index 000000000..9f052c2b4
--- /dev/null
+++ b/profiling/energy-profiler/tests/daemon_nvml_integration_test.cpp
@@ -0,0 +1,123 @@
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <atomic>
+#include <iomanip>
+#include "../common/daemon.hpp"
+#include "../provider/provider_nvml.hpp"
+
+// State shared between the daemon callback and the test body
+static NVMLProvider* g_nvml_provider = nullptr;  // set and cleared by the test
+static std::atomic<uint32_t> g_sample_count{0};  // samples taken so far
+static std::atomic<double> g_total_energy{0.0};  // sum of power * 1.0 s
+static std::atomic<double> g_last_power{0.0};    // most recent total reading
+
+void power_monitoring_function() {
+  // A missing or uninitialized provider is reported on every tick.
+  const bool provider_ready =
+      g_nvml_provider != nullptr && g_nvml_provider->is_initialized();
+  if (!provider_ready) {
+    std::cout << "ERROR: NVML provider not initialized" << std::endl;
+    return;
+  }
+
+  const double watts = g_nvml_provider->get_total_power_usage();
+  g_last_power = watts;
+
+  // Energy += power * dt, with dt taken as the nominal 1 s sampling period.
+  // A failed exchange reloads `energy_seen`, so each retry uses fresh data.
+  double energy_seen = g_total_energy.load();
+  while (!g_total_energy.compare_exchange_weak(energy_seen,
+                                               energy_seen + watts * 1.0)) {
+  }
+
+  const uint32_t sample_no = ++g_sample_count;
+  std::cout << std::fixed << std::setprecision(2) << "Sample #" << sample_no
+            << " - Power: " << watts << " W"
+            << " - Total Energy: " << g_total_energy.load() << " J"
+            << std::endl;
+
+  // Per-device lines: multi-GPU systems only, non-negative readings only.
+  const size_t gpu_count = g_nvml_provider->get_device_count();
+  if (gpu_count <= 1) return;
+
+  for (size_t gpu = 0; gpu < gpu_count; ++gpu) {
+    const double gpu_watts = g_nvml_provider->get_device_power_usage(gpu);
+    if (!(gpu_watts >= 0.0)) continue;
+    std::cout << "  Device " << gpu << " ("
+              << g_nvml_provider->get_device_name(gpu) << "): " << gpu_watts
+              << " W" << std::endl;
+  }
+}
+
+bool test_daemon_nvml_integration() {
+  std::cout << "=== Daemon + NVML Integration Test ===" << std::endl;
+
+  // Initialize NVML provider
+  std::cout << "\n1. Initializing NVML provider..." << std::endl;
+  NVMLProvider nvml_provider;
+  if (!nvml_provider.initialize()) {
+    std::cout << "ERROR: Failed to initialize NVML provider" << std::endl;
+    return false;
+  }
+
+  g_nvml_provider = &nvml_provider;
+  std::cout << "SUCCESS: NVML provider initialized with "
+            << nvml_provider.get_device_count() << " device(s)" << std::endl;
+
+  // Create daemon with 1-second interval
+  std::cout << "\n2. Creating daemon with 1-second monitoring interval..."
+            << std::endl;
+  Daemon power_daemon(power_monitoring_function, 1000);
+
+  // Start monitoring
+  std::cout << "\n3. Starting power monitoring..." << std::endl;
+  power_daemon.start();
+  std::cout << "SUCCESS: Power monitoring started" << std::endl;
+
+  // Let it run for 2 seconds
+  std::cout << "\n4. Monitoring for 2 seconds..." << std::endl;
+  std::this_thread::sleep_for(std::chrono::seconds(2));
+
+  // Stop monitoring
+  std::cout << "\n5. Stopping power monitoring..." << std::endl;
+  power_daemon.stop();
+  std::cout << "SUCCESS: Power monitoring stopped" << std::endl;
+
+  // Display final statistics
+  std::cout << "\n=== Final Statistics ===" << std::endl;
+  std::cout << "Total samples: " << g_sample_count.load() << std::endl;
+  std::cout << "Last power reading: " << std::fixed << std::setprecision(2)
+            << g_last_power.load() << " W" << std::endl;
+  std::cout << "Total energy consumed: " << std::fixed << std::setprecision(2)
+            << g_total_energy.load() << " J" << std::endl;
+
+  if (g_sample_count.load() > 0) {
+    double avg_power = g_total_energy.load() / g_sample_count.load();
+    std::cout << "Average power: " << std::fixed << std::setprecision(2)
+              << avg_power << " W" << std::endl;
+  }
+
+  // Cleanup
+  std::cout << "\n6. Cleaning up..." << std::endl;
+  g_nvml_provider = nullptr;
+  nvml_provider.finalize();
+  std::cout << "SUCCESS: Cleanup completed" << std::endl;
+
+  return g_sample_count.load() > 0;  // no samples means the daemon never ran
+}
+
+int main() {
+  // Exit status 0 on success; 1 on a failed check or a std::exception.
+  try {
+    const bool passed = test_daemon_nvml_integration();
+    const char* verdict = passed ? "\nIntegration test PASSED!"
+                                 : "\nIntegration test FAILED!";
+    std::cout << verdict << std::endl;
+    return passed ? 0 : 1;
+  } catch (const std::exception& e) {
+    std::cerr << "\nTest failed with exception: " << e.what() << std::endl;
+  }
+
+  return 1;
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/daemon_test.cpp b/profiling/energy-profiler/tests/daemon_test.cpp
new file mode 100644
index 000000000..f33a6da01
--- /dev/null
+++ b/profiling/energy-profiler/tests/daemon_test.cpp
@@ -0,0 +1,221 @@
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <atomic>
+#include <cassert>
+#include <stdexcept>
+#include "../common/daemon.hpp"
+
+// Atomic counters/flag written by the callback functions below, read by tests
+static std::atomic<uint32_t> counter{0};       // bumped by hello_world()
+static std::atomic<uint32_t> fast_counter{0};  // bumped by fast_function()
+static std::atomic<uint32_t> slow_counter{0};  // bumped by slow_function()
+static std::atomic<bool> exception_thrown{false};  // set by exception_function()
+
+// Test functions
+void hello_world() {  // daemon callback: announce this run, then count it
+  const auto run_number = counter.load() + 1;  // 1-based number of this run
+  std::cout << "Hello World (execution #" << run_number << ")" << std::endl;
+  counter++;
+}
+
+void fast_function() {
+  // Near-instant callback: one atomic increment and nothing else.
+  fast_counter.fetch_add(1);
+}
+
+void slow_function() {
+  // Count the call first, then block for 150 ms so that one run outlasts the
+  // 100 ms interval used by test_slow_function_handling().
+  slow_counter.fetch_add(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(150));
+}
+
+void exception_function() {  // always throws; no test in this file calls it
+  exception_thrown = true;  // record that the callback ran before throwing
+  throw std::runtime_error("Test exception in daemon function");
+}
+
+// Test utilities
+bool test_basic_functionality() {  // true if hello_world ran 3 or 4 times
+  std::cout << "\n=== Test 1: Basic Functionality ===" << std::endl;
+
+  counter = 0;
+  Daemon daemon(hello_world, 100);
+
+  // Initial state: not running (assert is compiled out if NDEBUG is defined)
+  assert(!daemon.is_running());
+
+  std::cout << "Starting daemon..." << std::endl;
+  daemon.start();
+
+  // Running state right after start()
+  assert(daemon.is_running());
+
+  // Let it run for ~350ms (should execute ~3-4 times at a 100ms interval)
+  std::this_thread::sleep_for(std::chrono::milliseconds(350));
+
+  daemon.stop();
+
+  // Stopped state after stop()
+  assert(!daemon.is_running());
+
+  uint32_t final_count = counter.load();
+  std::cout << "Daemon finished. Counter: " << final_count << std::endl;
+
+  // Pass only for exactly 3 or 4 executions; this bound is timing-dependent
+  bool success = (final_count >= 3 && final_count <= 4);
+  std::cout << "Test 1 " << (success ? "PASSED" : "FAILED") << std::endl;
+  return success;
+}
+
+bool test_timing_accuracy() {  // true if fast_function ran 4 to 6 times
+  std::cout << "\n=== Test 2: Timing Accuracy ===" << std::endl;
+
+  fast_counter = 0;
+  Daemon daemon(fast_function, 50);  // 50ms interval
+
+  auto start_time = std::chrono::high_resolution_clock::now();
+  daemon.start();
+
+  // Sleep for 250ms (sleep_for may block longer than requested)
+  std::this_thread::sleep_for(std::chrono::milliseconds(250));
+
+  daemon.stop();
+  auto end_time = std::chrono::high_resolution_clock::now();
+
+  uint32_t executions  = fast_counter.load();
+  auto actual_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+      end_time - start_time);
+
+  std::cout << "Executions: " << executions << std::endl;
+  std::cout << "Actual duration: " << actual_duration.count() << "ms"
+            << std::endl;
+
+  // Expect ~5 runs (250ms / 50ms); the measured duration is printed only
+  bool success = (executions >= 4 && executions <= 6);
+  std::cout << "Test 2 " << (success ? "PASSED" : "FAILED") << std::endl;
+  return success;
+}
+
+bool test_slow_function_handling() {  // true if slow_function ran 2 or 3 times
+  std::cout << "\n=== Test 3: Slow Function Handling ===" << std::endl;
+
+  slow_counter = 0;
+  Daemon daemon(slow_function,
+                100);  // 100ms interval, but function takes 150ms
+
+  auto start_time = std::chrono::high_resolution_clock::now();
+  daemon.start();
+
+  // Sleep for 400ms while the daemon keeps calling the 150ms callback
+  std::this_thread::sleep_for(std::chrono::milliseconds(400));
+
+  daemon.stop();
+  auto end_time = std::chrono::high_resolution_clock::now();
+
+  uint32_t executions  = slow_counter.load();
+  auto actual_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+      end_time - start_time);
+
+  std::cout << "Executions: " << executions << std::endl;
+  std::cout << "Actual duration: " << actual_duration.count() << "ms"
+            << std::endl;
+
+  // Should execute 2-3 times (each execution takes ~150ms, total time ~400ms)
+  bool success = (executions >= 2 && executions <= 3);
+  std::cout << "Test 3 " << (success ? "PASSED" : "FAILED") << std::endl;
+  return success;
+}
+
+bool test_start_stop_edge_cases() {  // true if all three misuses throw
+  std::cout << "\n=== Test 4: Start/Stop Edge Cases ===" << std::endl;
+
+  Daemon daemon(hello_world, 1000);
+  bool success = true;
+
+  // Double start: the second start() must throw std::runtime_error
+  try {
+    daemon.start();
+    daemon.start();  // Should throw
+    success = false;
+    std::cout << "ERROR: Double start should have thrown exception"
+              << std::endl;
+  } catch (const std::runtime_error& e) {
+    std::cout << "Double start correctly threw: " << e.what() << std::endl;
+  }
+
+  daemon.stop();  // legitimate stop of the daemon started above
+
+  // Double stop: stopping an already stopped daemon must throw
+  try {
+    daemon.stop();  // Should throw
+    success = false;
+    std::cout << "ERROR: Double stop should have thrown exception" << std::endl;
+  } catch (const std::runtime_error& e) {
+    std::cout << "Double stop correctly threw: " << e.what() << std::endl;
+  }
+
+  // Stop without start: a fresh, never-started daemon must also throw
+  Daemon daemon2(hello_world, 1000);
+  try {
+    daemon2.stop();  // Should throw
+    success = false;
+    std::cout << "ERROR: Stop without start should have thrown exception"
+              << std::endl;
+  } catch (const std::runtime_error& e) {
+    std::cout << "Stop without start correctly threw: " << e.what()
+              << std::endl;
+  }
+
+  std::cout << "Test 4 " << (success ? "PASSED" : "FAILED") << std::endl;
+  return success;
+}
+
+bool test_thread_safety() {  // checks is_running() before and after stop()
+  std::cout << "\n=== Test 5: Thread Safety ===" << std::endl;
+
+  counter = 0;
+  Daemon daemon(hello_world, 200);  // 200ms interval
+
+  daemon.start();
+
+  // Query is_running() from the main thread, 50ms apart, while started
+  bool running_check1 = daemon.is_running();
+  std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  bool running_check2 = daemon.is_running();
+
+  daemon.stop();
+  bool running_check3 = daemon.is_running();
+
+  bool success = running_check1 && running_check2 && !running_check3;
+  std::cout << "Running state checks: " << running_check1 << ", "
+            << running_check2 << ", " << running_check3 << std::endl;
+  std::cout << "Executions during test: " << counter.load() << std::endl;
+  std::cout << "Test 5 " << (success ? "PASSED" : "FAILED") << std::endl;
+  return success;
+}
+
+int main() {  // runs the five daemon tests in order; exit 0 only if all pass
+  std::cout << "=== Daemon Comprehensive Test Suite ===" << std::endl;
+
+  // Suite entries, executed sequentially in the order listed.
+  bool (*const suite[])() = {test_basic_functionality, test_timing_accuracy,
+                             test_slow_function_handling,
+                             test_start_stop_edge_cases, test_thread_safety};
+  const int total = 5;
+  int passed      = 0;
+  for (auto* test_case : suite) {
+    passed += test_case() ? 1 : 0;
+  }
+
+  std::cout << "\n=== Test Results ===" << std::endl;
+  std::cout << "Passed: " << passed << "/" << total << std::endl;
+
+  if (passed != total) {
+    std::cout << "Some tests failed. Please check the daemon implementation."
+              << std::endl;
+    return 1;
+  }
+  std::cout << "ALL TESTS PASSED! Daemon is working correctly." << std::endl;
+  return 0;
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/daemon_variorum_fast_test.cpp b/profiling/energy-profiler/tests/daemon_variorum_fast_test.cpp
new file mode 100644
index 000000000..747fa3368
--- /dev/null
+++ b/profiling/energy-profiler/tests/daemon_variorum_fast_test.cpp
@@ -0,0 +1,227 @@
+#include <cmath>
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <atomic>
+#include <iomanip>
+#include <vector>
+#include <algorithm>
+#include <mutex>
+#include <limits>
+#include "../common/daemon.hpp"
+#include "../provider/provider_variorum.hpp"
+
+// State shared between fast_power_monitoring_function() and the test body
+static VariorumProvider* g_variorum_provider = nullptr;  // non-owning
+static std::atomic<uint32_t> g_sample_count{0};
+static std::atomic<double> g_total_energy{0.0};  // sum of power * 0.02
+static std::atomic<double> g_min_power{std::numeric_limits<double>::max()};
+static std::atomic<double> g_max_power{0.0};
+static std::vector<double> g_power_samples;  // guarded by g_samples_mutex
+static std::mutex g_samples_mutex;
+
+void fast_power_monitoring_function() {  // one power sample per call
+  if (!g_variorum_provider || !g_variorum_provider->is_initialized()) {
+    return;
+  }
+
+  double current_power = g_variorum_provider->get_total_power_usage();
+
+  // Count the sample
+  g_sample_count.fetch_add(1);
+
+  // Accumulate energy (Power * Time)
+  // Assumes exactly 20ms between samples: energy increment = power * 0.02 s
+  double expected = g_total_energy.load();
+  while (!g_total_energy.compare_exchange_weak(
+      expected, expected + current_power * 0.02)) {
+    // On failure `expected` is reloaded with the current value; retry
+  }
+
+  // Update min power (stops once the stored minimum is already <= sample)
+  double current_min = g_min_power.load();
+  while (current_power < current_min &&
+         !g_min_power.compare_exchange_weak(current_min, current_power)) {
+    // Loop until successful update
+  }
+
+  // Update max power (stops once the stored maximum is already >= sample)
+  double current_max = g_max_power.load();
+  while (current_power > current_max &&
+         !g_max_power.compare_exchange_weak(current_max, current_power)) {
+    // Loop until successful update
+  }
+
+  // Store the raw sample for later statistics, under g_samples_mutex
+  {
+    std::lock_guard<std::mutex> lock(g_samples_mutex);
+    g_power_samples.push_back(current_power);
+  }
+}
+
+// Sample standard deviation (divides by n - 1) of `samples` around the
+// caller-supplied `mean`; returns 0.0 when there are fewer than two samples.
+double calculate_standard_deviation(const std::vector<double>& samples,
+                                    double mean) {
+  if (samples.size() < 2) return 0.0;
+  double squared_error_total = 0.0;
+  for (size_t idx = 0; idx < samples.size(); ++idx) {
+    const double delta = samples[idx] - mean;
+    squared_error_total += delta * delta;
+  }
+  return std::sqrt(squared_error_total / (samples.size() - 1));
+}
+
+bool test_daemon_variorum_fast_integration() {  // false only if init fails
+  std::cout << "=== Fast Daemon + Variorum Integration Test (20ms sampling) ==="
+            << std::endl;
+
+  // Reset global counters
+  g_sample_count = 0;
+  g_total_energy = 0.0;
+  g_min_power    = std::numeric_limits<double>::max();
+  g_max_power    = 0.0;
+  g_power_samples.clear();
+
+  // Initialize Variorum provider
+  std::cout << "\n1. Initializing Variorum provider..." << std::endl;
+  VariorumProvider variorum_provider;
+  if (!variorum_provider.initialize()) {
+    std::cout << "ERROR: Failed to initialize Variorum provider" << std::endl;
+    return false;
+  }
+
+  g_variorum_provider = &variorum_provider;
+  std::cout << "SUCCESS: Variorum provider initialized with "
+            << variorum_provider.get_device_count() << " device(s)"
+            << std::endl;
+
+  // Create daemon with 20ms interval
+  std::cout << "\n2. Creating daemon with 20ms monitoring interval..."
+            << std::endl;
+  Daemon power_daemon(fast_power_monitoring_function, 20);
+
+  // Start monitoring
+  std::cout << "\n3. Starting fast power monitoring..." << std::endl;
+  power_daemon.start();
+  std::cout << "SUCCESS: Fast power monitoring started" << std::endl;
+
+  // Let it run for 2 seconds
+  std::cout << "\n4. Monitoring for 2 seconds (high frequency sampling)..."
+            << std::endl;
+  std::cout << "   (No real-time output to avoid saturation)" << std::endl;
+
+  auto start_time = std::chrono::high_resolution_clock::now();
+  std::this_thread::sleep_for(std::chrono::seconds(2));
+  auto end_time = std::chrono::high_resolution_clock::now();
+
+  // Stop monitoring
+  std::cout << "\n5. Stopping power monitoring..." << std::endl;
+  power_daemon.stop();
+  std::cout << "SUCCESS: Power monitoring stopped" << std::endl;
+
+  // Duration of the sleep only; time spent in start()/stop() is not included
+  auto actual_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+      end_time - start_time);
+
+  // Analyze collected data
+  std::cout << "\n=== Statistical Analysis ===" << std::endl;
+
+  uint32_t total_samples = g_sample_count.load();
+  double total_energy    = g_total_energy.load();
+  double min_power       = g_min_power.load();
+  double max_power       = g_max_power.load();
+
+  std::cout << "Monitoring duration: " << actual_duration.count() << " ms"
+            << std::endl;
+  std::cout << "Total samples collected: " << total_samples << std::endl;
+  std::cout << "Expected samples (50 Hz): " << (actual_duration.count() / 20)
+            << std::endl;
+  std::cout << "Sampling efficiency: " << std::fixed << std::setprecision(1)
+            << (100.0 * total_samples / (actual_duration.count() / 20.0)) << "%"
+            << std::endl;
+
+  if (total_samples > 0) {
+    double avg_power = total_energy / (total_samples * 0.02);  // undo 0.02 s
+
+    std::cout << "\n=== Power Statistics ===" << std::endl;
+    std::cout << "Average power: " << std::fixed << std::setprecision(2)
+              << avg_power << " W" << std::endl;
+    std::cout << "Minimum power: " << std::fixed << std::setprecision(2)
+              << min_power << " W" << std::endl;
+    std::cout << "Maximum power: " << std::fixed << std::setprecision(2)
+              << max_power << " W" << std::endl;
+    std::cout << "Power range: " << std::fixed << std::setprecision(2)
+              << (max_power - min_power) << " W" << std::endl;
+    std::cout << "Total energy consumed: " << std::fixed << std::setprecision(3)
+              << total_energy << " J" << std::endl;
+
+    // Order statistics from the stored samples (sorts g_power_samples in place)
+    {
+      std::lock_guard<std::mutex> lock(g_samples_mutex);
+      if (!g_power_samples.empty()) {
+        std::sort(g_power_samples.begin(), g_power_samples.end());
+
+        size_t n = g_power_samples.size();
+        double median =
+            (n % 2 == 0)
+                ? (g_power_samples[n / 2 - 1] + g_power_samples[n / 2]) / 2.0
+                : g_power_samples[n / 2];
+
+        double q1 = g_power_samples[n / 4];      // element at index n/4
+        double q3 = g_power_samples[3 * n / 4];  // element at index 3n/4
+
+        double std_dev =
+            calculate_standard_deviation(g_power_samples, avg_power);
+
+        std::cout << "\n=== Extended Statistics ===" << std::endl;
+        std::cout << "Median power: " << std::fixed << std::setprecision(2)
+                  << median << " W" << std::endl;
+        std::cout << "Q1 (25th percentile): " << std::fixed
+                  << std::setprecision(2) << q1 << " W" << std::endl;
+        std::cout << "Q3 (75th percentile): " << std::fixed
+                  << std::setprecision(2) << q3 << " W" << std::endl;
+        std::cout << "Standard deviation: " << std::fixed
+                  << std::setprecision(2) << std_dev << " W" << std::endl;
+        std::cout << "Coefficient of variation: " << std::fixed
+                  << std::setprecision(1) << (100.0 * std_dev / avg_power)
+                  << "%" << std::endl;
+      }
+    }
+
+    // Show per-device breakdown if multiple devices
+    size_t device_count = variorum_provider.get_device_count();
+    if (device_count > 1) {
+      std::cout << "\n=== Per-Device Final Readings ===" << std::endl;
+      for (size_t i = 0; i < device_count; ++i) {
+        double device_power     = variorum_provider.get_device_power_usage(i);
+        std::string device_name = variorum_provider.get_device_name(i);
+        std::cout << "  " << device_name << ": " << std::fixed
+                  << std::setprecision(2) << device_power << " W" << std::endl;
+      }
+    }
+  }
+
+  // Cleanup: clear the global pointer before finalizing the provider
+  std::cout << "\n6. Cleaning up..." << std::endl;
+  g_variorum_provider = nullptr;
+  variorum_provider.finalize();
+  std::cout << "SUCCESS: Cleanup completed" << std::endl;
+
+  return true;
+}
+
+int main() {  // exit status: 0 when the fast test passes, 1 otherwise
+  try {
+    if (!test_daemon_variorum_fast_integration()) {
+      std::cout << "\nFast integration test FAILED!" << std::endl;
+      return 1;
+    }
+    std::cout << "\nFast integration test PASSED!" << std::endl;
+    return 0;
+  } catch (const std::exception& ex) {
+    // A std::exception escaping the test is reported on stderr as a failure.
+    std::cerr << "\nTest failed with exception: " << ex.what() << std::endl;
+    return 1;
+  }
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/daemon_variorum_integration_test.cpp b/profiling/energy-profiler/tests/daemon_variorum_integration_test.cpp
new file mode 100644
index 000000000..fbbb1dfb6
--- /dev/null
+++ b/profiling/energy-profiler/tests/daemon_variorum_integration_test.cpp
@@ -0,0 +1,128 @@
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <atomic>
+#include <iomanip>
+#include "../common/daemon.hpp"
+#include "../provider/provider_variorum.hpp"
+
+// State shared between power_monitoring_function() and the test body
+static VariorumProvider* g_variorum_provider = nullptr;  // non-owning
+static std::atomic<uint32_t> g_sample_count{0};
+static std::atomic<double> g_total_energy{0.0};  // sum of power * 1.0
+static std::atomic<double> g_last_power{0.0};    // most recent reading
+
+void power_monitoring_function() {  // one printed power sample per call
+  if (!g_variorum_provider || !g_variorum_provider->is_initialized()) {
+    std::cout << "ERROR: Variorum provider not initialized" << std::endl;
+    return;
+  }
+
+  double current_power = g_variorum_provider->get_total_power_usage();
+  g_last_power.store(current_power);
+
+  // Accumulate energy (Power * Time)
+  // Assumes one sample per 1000ms: energy increment = power * 1.0 seconds
+  double expected = g_total_energy.load();
+  while (!g_total_energy.compare_exchange_weak(
+      expected, expected + current_power * 1.0)) {
+    // On failure `expected` is reloaded with the current value; retry
+  }
+
+  uint32_t sample_num = g_sample_count.fetch_add(1) + 1;  // 1-based index
+
+  std::cout << std::fixed << std::setprecision(2) << "Sample " << sample_num
+            << ": " << current_power
+            << " W (Total Energy: " << g_total_energy.load() << " J)"
+            << std::endl;
+
+  // Show individual device power if multiple devices (negative values skipped)
+  size_t device_count = g_variorum_provider->get_device_count();
+  if (device_count > 1) {
+    for (size_t i = 0; i < device_count; ++i) {
+      double device_power = g_variorum_provider->get_device_power_usage(i);
+      if (device_power >= 0.0) {
+        std::cout << "  " << g_variorum_provider->get_device_name(i) << ": "
+                  << device_power << " W" << std::endl;
+      }
+    }
+  }
+}
+
+bool test_daemon_variorum_integration() {  // false only if init fails
+  std::cout << "=== Daemon + Variorum Integration Test ===" << std::endl;
+
+  // Reset global counters
+  g_sample_count = 0;
+  g_total_energy = 0.0;
+  g_last_power   = 0.0;
+
+  // Initialize Variorum provider
+  std::cout << "\n1. Initializing Variorum provider..." << std::endl;
+  VariorumProvider variorum_provider;
+  if (!variorum_provider.initialize()) {
+    std::cout << "ERROR: Failed to initialize Variorum provider" << std::endl;
+    return false;
+  }
+
+  g_variorum_provider = &variorum_provider;
+  std::cout << "SUCCESS: Variorum provider initialized with "
+            << variorum_provider.get_device_count() << " device(s)"
+            << std::endl;
+
+  // Create daemon with 1-second interval
+  std::cout << "\n2. Creating daemon with 1-second monitoring interval..."
+            << std::endl;
+  Daemon power_daemon(power_monitoring_function, 1000);
+
+  // Start monitoring
+  std::cout << "\n3. Starting power monitoring..." << std::endl;
+  power_daemon.start();
+  std::cout << "SUCCESS: Power monitoring started" << std::endl;
+
+  // Let it run for 2 seconds
+  std::cout << "\n4. Monitoring for 2 seconds..." << std::endl;
+  std::this_thread::sleep_for(std::chrono::seconds(2));
+
+  // Stop monitoring
+  std::cout << "\n5. Stopping power monitoring..." << std::endl;
+  power_daemon.stop();
+  std::cout << "SUCCESS: Power monitoring stopped" << std::endl;
+
+  // Display final statistics (printed only; none of them affect the result)
+  std::cout << "\n=== Final Statistics ===" << std::endl;
+  std::cout << "Total samples: " << g_sample_count.load() << std::endl;
+  std::cout << "Last power reading: " << std::fixed << std::setprecision(2)
+            << g_last_power.load() << " W" << std::endl;
+  std::cout << "Total energy consumed: " << std::fixed << std::setprecision(2)
+            << g_total_energy.load() << " J" << std::endl;
+
+  if (g_sample_count.load() > 0) {  // avoid dividing by zero samples
+    double avg_power = g_total_energy.load() / (g_sample_count.load() * 1.0);
+    std::cout << "Average power: " << std::fixed << std::setprecision(2)
+              << avg_power << " W" << std::endl;
+  }
+
+  // Cleanup: clear the global pointer before finalizing the provider
+  std::cout << "\n6. Cleaning up..." << std::endl;
+  g_variorum_provider = nullptr;
+  variorum_provider.finalize();
+  std::cout << "SUCCESS: Cleanup completed" << std::endl;
+
+  return true;
+}
+
+int main() {  // exit status: 0 when the integration test passes, 1 otherwise
+  try {
+    if (!test_daemon_variorum_integration()) {
+      std::cout << "\nIntegration test FAILED!" << std::endl;
+      return 1;
+    }
+    std::cout << "\nIntegration test PASSED!" << std::endl;
+    return 0;
+  } catch (const std::exception& ex) {
+    // A std::exception escaping the test is reported on stderr as a failure.
+    std::cerr << "\nTest failed with exception: " << ex.what() << std::endl;
+    return 1;
+  }
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/nvml_provider_test.cpp b/profiling/energy-profiler/tests/nvml_provider_test.cpp
new file mode 100644
index 000000000..8f5d015ad
--- /dev/null
+++ b/profiling/energy-profiler/tests/nvml_provider_test.cpp
@@ -0,0 +1,100 @@
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include "../provider/provider_nvml.hpp"
+
+bool test_nvml_provider() {  // false when initialization or discovery fails
+  std::cout << "=== NVML Provider Test ===" << std::endl;
+
+  NVMLProvider provider;
+
+  // Test initialization
+  std::cout << "\n1. Testing initialization..." << std::endl;
+  if (!provider.initialize()) {
+    std::cout << "ERROR: Failed to initialize NVML provider" << std::endl;
+    return false;
+  }
+  std::cout << "SUCCESS: NVML provider initialized successfully" << std::endl;
+
+  // Test device discovery
+  std::cout << "\n2. Testing device discovery..." << std::endl;
+  size_t device_count = provider.get_device_count();
+  std::cout << "Found " << device_count << " device(s)" << std::endl;
+
+  if (device_count == 0) {
+    std::cout << "ERROR: No devices found" << std::endl;
+    return false;
+  }
+
+  // Display device information
+  std::cout << "\n3. Device information:" << std::endl;
+  for (size_t i = 0; i < device_count; ++i) {
+    std::string name = provider.get_device_name(i);
+    std::cout << "  Device " << i << ": " << name << std::endl;
+  }
+
+  // Take 5 samples, 100ms apart; a negative reading is reported, not fatal
+  std::cout << "\n4. Testing power readings..." << std::endl;
+  for (int sample = 0; sample < 5; ++sample) {
+    std::cout << "Sample " << (sample + 1) << ":" << std::endl;
+
+    // Individual device power
+    for (size_t i = 0; i < device_count; ++i) {
+      double power = provider.get_device_power_usage(i);
+      if (power >= 0.0) {
+        std::cout << "  Device " << i << ": " << power << " W" << std::endl;
+      } else {
+        std::cout << "  Device " << i << ": Failed to read power" << std::endl;
+      }
+    }
+
+    // Individual device direct power
+    for (size_t i = 0; i < device_count; ++i) {
+      double direct_power = provider.get_device_power_usage_direct(i);
+      if (direct_power >= 0.0) {
+        std::cout << "  Device " << i << " (Direct): " << direct_power << " W"
+                  << std::endl;
+      } else {
+        std::cout << "  Device " << i
+                  << " (Direct): Failed to read direct power" << std::endl;
+      }
+    }
+
+    // Current energy consumption
+    for (size_t i = 0; i < device_count; ++i) {
+      double energy = provider.get_current_energy_consumption(i);
+      if (energy >= 0.0) {
+        std::cout << "  Device " << i << " Energy: " << energy << " J"
+                  << std::endl;
+      } else {
+        std::cout << "  Device " << i << " Energy: Failed to read energy"
+                  << std::endl;
+      }
+    }
+
+    // Total power
+    double total_power = provider.get_total_power_usage();
+    std::cout << "  Total Power: " << total_power << " W" << std::endl;
+
+    if (sample < 4) {  // no pause after the last sample
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    }
+  }
+
+  // Test finalization
+  std::cout << "\n5. Testing finalization..." << std::endl;
+  provider.finalize();
+  std::cout << "SUCCESS: NVML provider finalized successfully" << std::endl;
+
+  std::cout << "\n=== Test Completed ===" << std::endl;
+  return true;
+}
+
+int main() {
+  try {
+    return test_nvml_provider() ? 0 : 1;  // failures now reach the exit code
+  } catch (const std::exception& e) {
+    std::cerr << "ERROR: Test failed with exception: " << e.what() << std::endl;
+    return 1;
+  }
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/timer_test.cpp b/profiling/energy-profiler/tests/timer_test.cpp
new file mode 100644
index 000000000..61a9a3cc2
--- /dev/null
+++ b/profiling/energy-profiler/tests/timer_test.cpp
@@ -0,0 +1,343 @@
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <cassert>
+#include "../common/timer.hpp"
+
+// True when |actual - expected| <= tolerance, without unsigned wraparound.
+bool is_within_range(uint64_t actual, uint64_t expected, uint64_t tolerance) {
+  return (actual > expected ? actual - expected : expected - actual) <= tolerance;
+}
+
+bool test_basic_timing() {  // times one ~2ms region and checks its record
+  std::cout << "=== Test Basic Timing ===" << std::endl;
+
+  EnergyTimer timer;
+
+  // Time a single region with id 1 around a 2ms sleep
+  timer.start_timing(1, RegionType::ParallelFor, "test_kernel");
+  std::this_thread::sleep_for(std::chrono::milliseconds(2));
+  timer.end_timing(1);
+
+  auto& timings = timer.get_timings();
+  if (timings.size() != 1) {
+    std::cout << "ERROR: Expected 1 timing, got " << timings.size()
+              << std::endl;
+    return false;
+  }
+
+  auto& timing = timings[1];  // looked up by the id passed to start_timing
+  if (!timing.is_ended()) {
+    std::cout << "ERROR: Timing should be ended" << std::endl;
+    return false;
+  }
+
+  uint64_t duration = timing.get_duration_ms();
+  if (!is_within_range(duration, 2, 2)) {  // 2ms ± 2ms tolerance
+    std::cout << "ERROR: Duration should be ~2ms, got " << duration << "ms"
+              << std::endl;
+    return false;
+  }
+
+  if (timing.name_ != "test_kernel") {
+    std::cout << "ERROR: Wrong name, expected 'test_kernel', got '"
+              << timing.name_ << "'" << std::endl;
+    return false;
+  }
+
+  if (timing.region_type_ != RegionType::ParallelFor) {
+    std::cout << "ERROR: Wrong region type" << std::endl;
+    return false;
+  }
+
+  std::cout << "SUCCESS: Basic timing works correctly (duration: " << duration
+            << "ms)" << std::endl;
+  return true;
+}
+
+bool test_multiple_timings() {  // three regions started together, ended apart
+  std::cout << "\n=== Test Multiple Timings ===" << std::endl;
+
+  EnergyTimer timer;
+
+  // Start all three timings back to back, before any sleep
+  timer.start_timing(1, RegionType::ParallelFor, "kernel_1");
+  timer.start_timing(2, RegionType::ParallelReduce, "kernel_2");
+  timer.start_timing(3, RegionType::UserRegion, "region_1");
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  timer.end_timing(1);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(2));
+  timer.end_timing(2);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(2));
+  timer.end_timing(3);
+
+  auto& timings = timer.get_timings();
+  if (timings.size() != 3) {
+    std::cout << "ERROR: Expected 3 timings, got " << timings.size()
+              << std::endl;
+    return false;
+  }
+
+  // Durations are cumulative because every region started at the same point
+  uint64_t duration1 = timings[1].get_duration_ms();
+  uint64_t duration2 = timings[2].get_duration_ms();
+  uint64_t duration3 = timings[3].get_duration_ms();
+
+  if (!is_within_range(duration1, 1, 1)) {
+    std::cout << "ERROR: Duration1 should be ~1ms, got " << duration1 << "ms"
+              << std::endl;
+    return false;
+  }
+
+  if (!is_within_range(duration2, 3, 2)) {  // 1 + 2 = 3ms
+    std::cout << "ERROR: Duration2 should be ~3ms, got " << duration2 << "ms"
+              << std::endl;
+    return false;
+  }
+
+  if (!is_within_range(duration3, 5, 2)) {  // 1 + 2 + 2 = 5ms
+    std::cout << "ERROR: Duration3 should be ~5ms, got " << duration3 << "ms"
+              << std::endl;
+    return false;
+  }
+
+  // Durations must be strictly increasing: duration1 < duration2 < duration3
+  if (duration2 <= duration1) {
+    std::cout << "ERROR: Duration2 should be greater than duration1"
+              << std::endl;
+    return false;
+  }
+
+  if (duration3 <= duration2) {
+    std::cout << "ERROR: Duration3 should be greater than duration2"
+              << std::endl;
+    return false;
+  }
+
+  std::cout << "SUCCESS: Multiple timings work correctly" << std::endl;
+  std::cout << "  Duration1: " << duration1 << "ms" << std::endl;
+  std::cout << "  Duration2: " << duration2 << "ms" << std::endl;
+  std::cout << "  Duration3: " << duration3 << "ms" << std::endl;
+  return true;
+}
+
+bool test_region_types() {  // each record keeps the type and name it was given
+  std::cout << "\n=== Test Region Types ===" << std::endl;
+
+  EnergyTimer timer;
+
+  // One timing per RegionType value used in this file, ids 1 through 6
+  timer.start_timing(1, RegionType::ParallelFor, "parallel_for");
+  timer.start_timing(2, RegionType::ParallelScan, "parallel_scan");
+  timer.start_timing(3, RegionType::ParallelReduce, "parallel_reduce");
+  timer.start_timing(4, RegionType::DeepCopy, "deep_copy");
+  timer.start_timing(5, RegionType::UserRegion, "user_region");
+  timer.start_timing(6, RegionType::Unknown, "unknown_op");
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(1));
+
+  timer.end_timing(1);
+  timer.end_timing(2);
+  timer.end_timing(3);
+  timer.end_timing(4);
+  timer.end_timing(5);
+  timer.end_timing(6);
+
+  auto& timings = timer.get_timings();
+  if (timings.size() != 6) {
+    std::cout << "ERROR: Expected 6 timings, got " << timings.size()
+              << std::endl;
+    return false;
+  }
+
+  // Verify region types against the ids they were registered under
+  if (timings[1].region_type_ != RegionType::ParallelFor ||
+      timings[2].region_type_ != RegionType::ParallelScan ||
+      timings[3].region_type_ != RegionType::ParallelReduce ||
+      timings[4].region_type_ != RegionType::DeepCopy ||
+      timings[5].region_type_ != RegionType::UserRegion ||
+      timings[6].region_type_ != RegionType::Unknown) {
+    std::cout << "ERROR: Region types not correctly set" << std::endl;
+    return false;
+  }
+
+  // Verify names against the ids they were registered under
+  if (timings[1].name_ != "parallel_for" ||
+      timings[2].name_ != "parallel_scan" ||
+      timings[3].name_ != "parallel_reduce" ||
+      timings[4].name_ != "deep_copy" || timings[5].name_ != "user_region" ||
+      timings[6].name_ != "unknown_op") {
+    std::cout << "ERROR: Names not correctly set" << std::endl;
+    return false;
+  }
+
+  std::cout << "SUCCESS: All region types work correctly" << std::endl;
+  return true;
+}
+
+bool test_error_handling() {  // misuse of end_timing must not crash
+  std::cout << "\n=== Test Error Handling ===" << std::endl;
+
+  EnergyTimer timer;
+
+  // End an id that was never started; only "does not crash" is checked
+  timer.end_timing(999);  // This should not crash
+
+  // A started timing must not report is_ended() before end_timing()
+  timer.start_timing(1, RegionType::ParallelFor, "test");
+  auto& timings = timer.get_timings();
+
+  if (timings[1].is_ended()) {
+    std::cout << "ERROR: Timing should not be ended yet" << std::endl;
+    return false;
+  }
+
+  // End the timing
+  timer.end_timing(1);
+
+  if (!timings[1].is_ended()) {
+    std::cout << "ERROR: Timing should be ended now" << std::endl;
+    return false;
+  }
+
+  // End the same id a second time; again only "does not crash" is checked
+  timer.end_timing(1);
+
+  std::cout << "SUCCESS: Error handling works correctly" << std::endl;
+  return true;
+}
+
+bool test_precision() {  // fails only on the ~10ms check; short one just warns
+  std::cout << "\n=== Test Precision ===" << std::endl;
+
+  EnergyTimer timer;
+
+  // Test very short timing (should be 0 or 1 ms)
+  timer.start_timing(1, RegionType::ParallelFor, "short_op");
+  // No sleep - immediate end
+  timer.end_timing(1);
+
+  auto& timings           = timer.get_timings();
+  uint64_t short_duration = timings[1].get_duration_ms();
+
+  if (short_duration > 2) {  // warning only, does not fail the test
+    std::cout << "WARNING: Short duration is " << short_duration
+              << "ms (expected ≤2ms)" << std::endl;
+  }
+
+  // Time a 10ms sleep; accepted range is 10ms +/- 5ms
+  timer.start_timing(2, RegionType::ParallelFor, "long_op");
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  timer.end_timing(2);
+
+  uint64_t long_duration = timings[2].get_duration_ms();
+
+  if (!is_within_range(long_duration, 10, 5)) {
+    std::cout << "ERROR: Long duration should be ~10ms, got " << long_duration
+              << "ms" << std::endl;
+    return false;
+  }
+
+  std::cout << "SUCCESS: Precision test passed" << std::endl;
+  std::cout << "  Short duration: " << short_duration << "ms" << std::endl;
+  std::cout << "  Long duration: " << long_duration << "ms" << std::endl;
+  return true;
+}
+
+bool test_concurrent_timings() {  // nested (overlapping) regions on one thread
+  std::cout << "\n=== Test Concurrent Timings ===" << std::endl;
+
+  EnergyTimer timer;
+
+  // "outer" (id 1) fully encloses "inner" (id 2)
+  timer.start_timing(1, RegionType::ParallelFor, "outer");
+  std::this_thread::sleep_for(std::chrono::milliseconds(1));
+
+  timer.start_timing(2, RegionType::ParallelReduce, "inner");
+  std::this_thread::sleep_for(std::chrono::milliseconds(2));
+  timer.end_timing(2);  // End inner first
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  timer.end_timing(1);  // End outer last
+
+  auto& timings           = timer.get_timings();
+  uint64_t outer_duration = timings[1].get_duration_ms();
+  uint64_t inner_duration = timings[2].get_duration_ms();
+
+  // Outer must be strictly longer than inner (in whole milliseconds)
+  if (outer_duration <= inner_duration) {
+    std::cout << "ERROR: Outer duration (" << outer_duration
+              << "ms) should be greater than inner duration (" << inner_duration
+              << "ms)" << std::endl;
+    return false;
+  }
+
+  // Check approximate durations, each within +/- 2ms
+  if (!is_within_range(inner_duration, 2, 2)) {
+    std::cout << "ERROR: Inner duration should be ~2ms, got " << inner_duration
+              << "ms" << std::endl;
+    return false;
+  }
+
+  if (!is_within_range(outer_duration, 4, 2)) {  // 1 + 2 + 1 = 4ms
+    std::cout << "ERROR: Outer duration should be ~4ms, got " << outer_duration
+              << "ms" << std::endl;
+    return false;
+  }
+
+  std::cout << "SUCCESS: Concurrent timings work correctly" << std::endl;
+  std::cout << "  Outer duration: " << outer_duration << "ms" << std::endl;
+  std::cout << "  Inner duration: " << inner_duration << "ms" << std::endl;
+  return true;
+}
+
+bool very_long_timing() {  // times one 50ms sleep; accepts 40ms to 60ms
+  std::cout << "\n=== Test Very Long Timing ===" << std::endl;
+
+  EnergyTimer timer;
+
+  timer.start_timing(1, RegionType::ParallelFor, "very_long_op");
+  std::this_thread::sleep_for(
+      std::chrono::milliseconds(50));  // Sleep for 50ms instead of 1 second
+  timer.end_timing(1);
+
+  auto& timings     = timer.get_timings();
+  uint64_t duration = timings[1].get_duration_ms();
+
+  if (!is_within_range(duration, 50, 10)) {  // 50ms +/- 10ms
+    std::cout << "ERROR: Duration should be ~50ms, got " << duration << "ms"
+              << std::endl;
+    return false;
+  }
+
+  std::cout << "SUCCESS: Very long timing works correctly (duration: "
+            << duration << "ms)" << std::endl;
+  return true;
+}
+
+int main() {  // runs every EnergyTimer test; exit code 0 only if all succeed
+  std::cout << "Running EnergyTimer Tests..." << std::endl;
+  std::cout << "=============================" << std::endl;
+
+  // Every entry is executed even after a failure, so all results get printed.
+  bool (*const suite[])() = {test_basic_timing,   test_multiple_timings,
+                             test_region_types,   test_error_handling,
+                             test_precision,      test_concurrent_timings,
+                             very_long_timing};
+
+  bool all_passed = true;
+  for (auto* test_case : suite) {
+    if (!test_case()) all_passed = false;
+  }
+
+  std::cout << "\n=============================" << std::endl;
+  if (!all_passed) {
+    std::cout << "SOME TESTS FAILED!" << std::endl;
+    return 1;
+  }
+  std::cout << "ALL TESTS PASSED!" << std::endl;
+  return 0;
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tests/variorum_provider_test.cpp b/profiling/energy-profiler/tests/variorum_provider_test.cpp
new file mode 100644
index 000000000..8dae4a4a5
--- /dev/null
+++ b/profiling/energy-profiler/tests/variorum_provider_test.cpp
@@ -0,0 +1,78 @@
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+#include <thread>
+#include "../provider/provider_variorum.hpp"
+
+// Smoke test for VariorumProvider: init, devices, 5 power samples, finalize.
+// Throws std::runtime_error when initialization fails or no device is found.
+void test_variorum_provider() {
+  std::cout << "=== Variorum Provider Test ===" << std::endl;
+  VariorumProvider provider;
+
+  // Test initialization
+  std::cout << "\n1. Testing initialization..." << std::endl;
+  if (!provider.initialize()) {
+    std::cout << "ERROR: Failed to initialize Variorum provider" << std::endl;
+    throw std::runtime_error("Variorum provider initialization failed");
+  }
+  std::cout << "SUCCESS: Variorum provider initialized successfully"
+            << std::endl;
+
+  // Test device discovery
+  std::cout << "\n2. Testing device discovery..." << std::endl;
+  const size_t device_count = provider.get_device_count();
+  std::cout << "Found " << device_count << " device(s)" << std::endl;
+  if (device_count == 0) {
+    std::cout << "ERROR: No devices found" << std::endl;
+    provider.finalize();  // initialize() succeeded above, so undo it first
+    throw std::runtime_error("Variorum provider found no devices");
+  }
+
+  // Display device information
+  std::cout << "\n3. Device information:" << std::endl;
+  for (size_t i = 0; i < device_count; ++i) {
+    std::string name = provider.get_device_name(i);
+    std::cout << "  Device " << i << ": " << name << std::endl;
+  }
+
+  // Test power readings
+  std::cout << "\n4. Testing power readings..." << std::endl;
+  for (int sample = 0; sample < 5; ++sample) {
+    std::cout << "Sample " << (sample + 1) << ":" << std::endl;
+
+    // Individual device power; a negative value is reported as a failed read
+    for (size_t i = 0; i < device_count; ++i) {
+      double power = provider.get_device_power_usage(i);
+      if (power >= 0.0) {
+        std::cout << "  Device " << i << ": " << power << " W" << std::endl;
+      } else {
+        std::cout << "  Device " << i << ": Failed to read power" << std::endl;
+      }
+    }
+
+    double total_power = provider.get_total_power_usage();
+    std::cout << "  Total Power: " << total_power << " W" << std::endl;
+
+    if (sample < 4) {  // no pause after the fifth and final sample
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    }
+  }
+
+  // Test finalization
+  std::cout << "\n5. Testing finalization..." << std::endl;
+  provider.finalize();
+  std::cout << "SUCCESS: Variorum provider finalized successfully" << std::endl;
+
+  std::cout << "\n=== Test Completed ===" << std::endl;
+}
+
+int main() {
+  try {
+    test_variorum_provider();
+  } catch (const std::exception& e) {
+    std::cerr << "ERROR: Test failed with exception: " << e.what() << std::endl;
+    return 1;  // a std::exception escaped the test
+  }
+  return 0;  // the test returned normally
+}
\ No newline at end of file
diff --git a/profiling/energy-profiler/tools/kernel_timer_tool.cpp b/profiling/energy-profiler/tools/kernel_timer_tool.cpp
new file mode 100644
index 000000000..ea0895fbc
--- /dev/null
+++ b/profiling/energy-profiler/tools/kernel_timer_tool.cpp
@@ -0,0 +1,112 @@
+#include "kernel_timer_tool.hpp"
+#include <iostream>
+#include <chrono>
+
+// Announces the tool on stdout; the device-info arguments are not used.
+void KernelTimerTool::init_library(
+    const int loadSeq, const uint64_t interfaceVer, const uint32_t devInfoCount,
+    Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
+  static_cast<void>(devInfoCount);
+  static_cast<void>(deviceInfo);
+  std::cout << "Kokkos Power Profiler: Initializing with load sequence "
+            << loadSeq << " and interface version " << interfaceVer << std::endl
+            << "Kokkos Power Profiler: Library initialized" << std::endl;
+}
+
+void KernelTimerTool::finalize_library() {
+  // No-op: nothing is flushed or cleared here; recorded timings are kept.
+}
+
+void KernelTimerTool::start_region(const std::string& name, RegionType type,
+                                   uint64_t id) {
+  TimingInfo opened;  // becomes the innermost entry of the open-region stack
+  opened.id         = id;
+  opened.name       = name;
+  opened.type       = type;
+  opened.start_time = std::chrono::high_resolution_clock::now();
+  active_regions_.push_back(opened);
+}
+
+// Closes the innermost open region (no-op when none is open). The clock is
+// read first so the record copy and pop below are not billed to the region.
+void KernelTimerTool::end_region() {
+  const auto now = std::chrono::high_resolution_clock::now();
+  if (active_regions_.empty()) return;
+  TimingInfo& region = active_regions_.back();
+  region.end_time    = now;
+  region.duration    = std::chrono::duration_cast<std::chrono::nanoseconds>(
+      region.end_time - region.start_time);
+  auto& finished = region.type == RegionType::UserRegion ? completed_regions_
+                   : region.type == RegionType::DeepCopy ? completed_deepcopies_
+                                                         : completed_kernels_;
+  finished.push_back(region);  // single copy, taken before the pop below
+  active_regions_.pop_back();
+}
+
+void KernelTimerTool::begin_parallel_for(const char* name, const uint32_t devID,
+                                         uint64_t kID) {
+  static_cast<void>(devID);  // the device id is not recorded
+  start_region(std::string(name), RegionType::ParallelFor, kID);
+}
+
+void KernelTimerTool::end_parallel_for(uint64_t kID) {
+  static_cast<void>(kID);  // unused: end_region() closes the innermost region
+  end_region();
+}
+
+void KernelTimerTool::begin_parallel_scan(const char* name,
+                                          const uint32_t devID, uint64_t* kID) {
+  static_cast<void>(devID);  // unused; note *kID below is read, never written
+  start_region(std::string(name), RegionType::ParallelScan, *kID);
+}
+
+void KernelTimerTool::end_parallel_scan(uint64_t kID) {
+  static_cast<void>(kID);  // unused: end_region() closes the innermost region
+  end_region();
+}
+
+void KernelTimerTool::begin_parallel_reduce(const char* name,
+                                            const uint32_t devID,
+                                            uint64_t* kID) {
+  static_cast<void>(devID);  // unused; note *kID below is read, never written
+  start_region(std::string(name), RegionType::ParallelReduce, *kID);
+}
+
+void KernelTimerTool::end_parallel_reduce(uint64_t kID) {
+  static_cast<void>(kID);  // unused: end_region() closes the innermost region
+  end_region();
+}
+
+void KernelTimerTool::begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                                      const char* dst_name, const void* dst_ptr,
+                                      Kokkos::Tools::SpaceHandle src_handle,
+                                      const char* src_name, const void* src_ptr,
+                                      uint64_t size) {
+  static_cast<void>(dst_handle);  // only dst_name and dst_ptr are used below
+  static_cast<void>(src_handle);
+  static_cast<void>(src_name);
+  static_cast<void>(src_ptr);
+  static_cast<void>(size);
+  const auto region_id = reinterpret_cast<uint64_t>(dst_ptr);  // address as id
+  start_region(std::string(dst_name), RegionType::DeepCopy, region_id);
+}
+
+void KernelTimerTool::end_deep_copy() { end_region(); }  // pops the stack
+
+void KernelTimerTool::push_profile_region(const char* region_name) {
+  start_region(region_name, RegionType::UserRegion, next_region_id_++);
+}  // each pushed user region takes the next value of next_region_id_
+
+void KernelTimerTool::pop_profile_region() { end_region(); }  // pops the stack
+
+const std::deque<TimingInfo>& KernelTimerTool::get_kernel_timings() const {
+  return completed_kernels_;  // closed non-user, non-deep-copy regions
+}
+
+const std::deque<TimingInfo>& KernelTimerTool::get_region_timings() const {
+  return completed_regions_;  // closed RegionType::UserRegion entries
+}
+
+const std::deque<TimingInfo>& KernelTimerTool::get_deep_copy_timings() const {
+  return completed_deepcopies_;  // closed RegionType::DeepCopy entries
+}
diff --git a/profiling/energy-profiler/tools/kernel_timer_tool.hpp b/profiling/energy-profiler/tools/kernel_timer_tool.hpp
new file mode 100644
index 000000000..1dd1f0963
--- /dev/null
+++ b/profiling/energy-profiler/tools/kernel_timer_tool.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <string>
+#include <deque>
+#include "../common/tool_interface.hpp"
+#include "../common/timer.hpp"
+
+class KernelTimerTool : public ToolInterface {  // records region durations
+ public:
+  KernelTimerTool()           = default;
+  ~KernelTimerTool() override = default;
+
+  void init_library(const int loadSeq, const uint64_t interfaceVer,
+                    const uint32_t devInfoCount,
+                    Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) override;
+  void finalize_library() override;
+
+  // Stack-based timing: end_region() closes the most recent start_region()
+  void start_region(const std::string& name, RegionType type, uint64_t id = 0);
+  void end_region();  // does nothing when no region is open
+
+  // Kokkos interface: begin_*/push open a region, end_*/pop close one
+  void begin_parallel_for(const char* name, const uint32_t devID,
+                          uint64_t kID) override;
+  void end_parallel_for(uint64_t kID) override;
+
+  void begin_parallel_scan(const char* name, const uint32_t devID,
+                           uint64_t* kID) override;
+  void end_parallel_scan(uint64_t kID) override;
+
+  void begin_parallel_reduce(const char* name, const uint32_t devID,
+                             uint64_t* kID) override;
+  void end_parallel_reduce(uint64_t kID) override;
+
+  void begin_deep_copy(Kokkos::Tools::SpaceHandle dst_handle,
+                       const char* dst_name, const void* dst_ptr,
+                       Kokkos::Tools::SpaceHandle src_handle,
+                       const char* src_name, const void* src_ptr,
+                       uint64_t size) override;
+  void end_deep_copy() override;
+
+  void push_profile_region(const char* region_name) override;
+  void pop_profile_region() override;
+
+  // Getters for summary: completed records, in the order they were closed
+  const std::deque<TimingInfo>& get_kernel_timings() const;
+  const std::deque<TimingInfo>& get_region_timings() const;
+  const std::deque<TimingInfo>& get_deep_copy_timings() const;
+
+ private:
+  std::deque<TimingInfo> active_regions_;        // open regions, newest last
+  std::deque<TimingInfo> completed_kernels_;     // closed, not user/deep copy
+  std::deque<TimingInfo> completed_regions_;     // closed user regions
+  std::deque<TimingInfo> completed_deepcopies_;  // closed deep copies
+  uint64_t next_region_id_ = 1;  // id handed to the next pushed user region
+};
\ No newline at end of file