diff --git a/CMakeLists.txt b/CMakeLists.txt index bc3b1e63b..87ad5ea53 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,23 @@ else() set(KOKKOSTOOLS_HAS_VTUNE OFF) endif() +# Check for NVML (required for energy profiler) +set(MIN_CUDA_VERSION 12.6) +find_package(CUDAToolkit ${MIN_CUDA_VERSION} QUIET) +if (CUDAToolkit_FOUND) + find_package(CUDA::nvml QUIET) + if(TARGET CUDA::nvml) + message(STATUS "Found CUDA NVML (version ${CUDAToolkit_VERSION}), energy profiler will be built") + set(KOKKOSTOOLS_HAS_NVML ON) + else() + message(STATUS "CUDA::nvml target not found, energy profiler will be skipped") + set(KOKKOSTOOLS_HAS_NVML OFF) + endif() +else() + message(STATUS "CUDAToolkit ${MIN_CUDA_VERSION} or higher not found, energy profiler will be skipped") + set(KOKKOSTOOLS_HAS_NVML OFF) +endif() + # make Kokkos profiling interface available for native profilers include_directories(${CMAKE_CURRENT_SOURCE_DIR}/profiling/all) @@ -150,6 +167,11 @@ if(NOT WIN32) add_subdirectory(profiling/chrome-tracing) add_subdirectory(profiling/space-time-stack) add_subdirectory(profiling/perfetto-connector) + if(KOKKOSTOOLS_HAS_NVML) + add_subdirectory(profiling/energy-profiler) + else() + message(STATUS "Skipping energy-profiler (NVML not available)") + endif() endif() # External lib connectors diff --git a/profiling/energy-profiler/CMakeLists.txt b/profiling/energy-profiler/CMakeLists.txt new file mode 100644 index 000000000..70d2cc417 --- /dev/null +++ b/profiling/energy-profiler/CMakeLists.txt @@ -0,0 +1,11 @@ +kp_add_library(kp_energy_profiler + kp_energy_profiler.cpp + timing_utils.cpp + timing_export.cpp + nvml_provider.cpp + power_sampler.cpp + daemon.cpp +) + +target_link_libraries(kp_energy_profiler PRIVATE CUDA::nvml) +target_compile_definitions(kp_energy_profiler PRIVATE KOKKOS_ENERGY_PROFILER_HAS_NVML) diff --git a/profiling/energy-profiler/daemon.cpp b/profiling/energy-profiler/daemon.cpp new file mode 100644 index 000000000..1884c234d --- /dev/null +++ b/profiling/energy-profiler/daemon.cpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "daemon.hpp" +#include +#include + +void Daemon::start() { + if (!running_) { + running_ = true; + thread_ = std::thread(&Daemon::run, this); + } else { + throw std::runtime_error("Daemon already started"); + } +} + +void Daemon::run() { + while (running_) { + auto next_run = std::chrono::high_resolution_clock::now() + interval_; + func_(); + std::this_thread::sleep_until(next_run); + } +} + +void Daemon::stop() { + if (running_) { + running_ = false; + thread_.join(); + } else { + throw std::runtime_error("Daemon not started"); + } +} diff --git a/profiling/energy-profiler/daemon.hpp b/profiling/energy-profiler/daemon.hpp new file mode 100644 index 000000000..f99724198 --- /dev/null +++ b/profiling/energy-profiler/daemon.hpp @@ -0,0 +1,39 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#pragma once + +#include +#include +#include + +class Daemon { + public: + Daemon(std::function func, int interval_ms) + : interval_(interval_ms), func_(func){}; + + void start(); + void run(); + void stop(); + bool is_running() const { return running_; } + std::thread& get_thread() { return thread_; } + + private: + std::chrono::milliseconds interval_; + bool running_{false}; + std::function func_; + std::thread thread_; +}; diff --git a/profiling/energy-profiler/energy_profiler_constants.hpp b/profiling/energy-profiler/energy_profiler_constants.hpp new file mode 100644 index 000000000..3f5d69eee --- /dev/null +++ b/profiling/energy-profiler/energy_profiler_constants.hpp @@ -0,0 +1,38 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#pragma once + +#include + +namespace KokkosTools { +namespace EnergyProfiler { + +// Sampling interval in milliseconds +constexpr int SAMPLING_INTERVAL_MS = 20; + +// Buffer size for hostname +const size_t HOSTNAME_BUFFER_SIZE = 256; + +// Table formatting constants for timing export +const int COLUMN_WIDTH_CATEGORY = 10; +const int COLUMN_WIDTH_NAME = 32; +const int COLUMN_WIDTH_TYPE = 14; +const int COLUMN_WIDTH_TIME = 17; +const int COLUMN_WIDTH_DURATION = 13; + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/kp_energy_profiler.cpp b/profiling/energy-profiler/kp_energy_profiler.cpp new file mode 100644 index 000000000..2757a56e3 --- /dev/null +++ b/profiling/energy-profiler/kp_energy_profiler.cpp @@ -0,0 +1,350 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include +#include +#include +#include + +#include "kp_core.hpp" +#include "timing_utils.hpp" +#include "timing_export.hpp" +#include "power_sampler.hpp" + +namespace KokkosTools { +namespace EnergyProfiler { + +// Global power sampler instance (completely decoupled from timing) +static std::unique_ptr g_power_sampler; + +// Helper function to generate new region ID +uint64_t generate_new_region_id() { + auto& state = EnergyProfilerState::get_instance(); + std::lock_guard lock(state.get_mutex()); + uint64_t current_id = state.get_next_region_id(); + state.increment_next_region_id(); + return current_id; +} + +// Helper function for verbose logging +void log_verbose(const std::string& message) { + if (EnergyProfilerState::get_instance().get_verbose_enabled()) { + std::cout << message << std::endl; + } +} + +// Start a region +void start_region(const std::string& name, RegionType type, uint64_t id) { + try { + TimingInfo region; + region.name = name; + region.type = type; + region.start_time = std::chrono::high_resolution_clock::now(); + region.id = id; + auto& state = EnergyProfilerState::get_instance(); + std::lock_guard lock(state.get_mutex()); + state.get_active_regions().push_back(region); + } catch (const std::exception& e) { + std::cerr << "EnergyProfiler ERROR: Error in start_region: " + << std::string(e.what()) << std::endl; + } +} + +// End last region of given type +void end_region_by_type(RegionType type_to_end) { + try { + auto& state = EnergyProfilerState::get_instance(); + std::lock_guard lock(state.get_mutex()); + auto& active_regions = state.get_active_regions(); + if (active_regions.empty()) return; + auto it = std::find_if(active_regions.rbegin(), active_regions.rend(), + [type_to_end](const TimingInfo& region) { + return region.type == type_to_end; + }); + if (it != active_regions.rend()) { + auto region = *it; + active_regions.erase(std::next(it).base()); + region.end_time = std::chrono::high_resolution_clock::now(); + state.get_completed_timings().push_back(region); + } + } catch (const std::exception& e) { + std::cerr << "EnergyProfiler ERROR: Error in end_region_by_type: " + << std::string(e.what()) << std::endl; + } +} + +// End region by id +void end_region_with_id(uint64_t expected_id) { + try { + auto end_time = std::chrono::high_resolution_clock::now(); + auto& state = EnergyProfilerState::get_instance(); + std::lock_guard lock(state.get_mutex()); + auto& active_regions = state.get_active_regions(); + auto it = std::find_if(active_regions.begin(), active_regions.end(), + [expected_id](const TimingInfo& region) { + return region.id == expected_id; + }); + if (it != active_regions.end()) { + auto region = *it; + region.end_time = end_time; + active_regions.erase(it); + state.get_completed_timings().push_back(region); + } else { + std::cerr + << "EnergyProfiler ERROR: Warning: No active region found with ID " + << std::to_string(expected_id) << std::endl; + } + } catch (const std::exception& e) { + std::cerr << "EnergyProfiler ERROR: Error in end_region_with_id: " + << std::string(e.what()) << std::endl; + } +} + +// Get all completed timings +std::vector get_all_timings() { + try { + auto& state = EnergyProfilerState::get_instance(); + std::lock_guard lock(state.get_mutex()); + std::vector all_timings = state.get_completed_timings(); + // Sort by start time; called only once at end, so O(n log n) is acceptable + std::sort(all_timings.begin(), all_timings.end(), + [](const TimingInfo& a, const TimingInfo& b) { + return a.start_time < b.start_time; + }); + return all_timings; + } catch (const std::exception& e) { + std::cerr << "EnergyProfiler ERROR: Error in get_all_timings: " + << std::string(e.what()) << std::endl; + return {}; + } +} + +} // namespace EnergyProfiler +} // namespace KokkosTools + +extern "C" { + +// Tool settings +void kokkosp_request_tool_settings(const uint32_t, + Kokkos_Tools_ToolSettings* settings) { + settings->requires_global_fencing = false; + settings->padding[0] = 0; +} + +// Library init +void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, + const uint32_t devInfoCount, + Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) { + (void)devInfoCount; + (void)deviceInfo; + const char* verbose_env = std::getenv("KOKKOS_TOOLS_ENERGY_VERBOSE"); + if (verbose_env && + (std::string(verbose_env) == "1" || std::string(verbose_env) == "ON")) { + KokkosTools::EnergyProfiler::EnergyProfilerState::get_instance() + .set_verbose_enabled(true); + } + std::cout << "Kokkos Energy Profiler: Initializing with load sequence " + << loadSeq << " and interface version " << interfaceVer + << std::endl; + std::cout << "Kokkos Energy Profiler: Library initialized" << std::endl; + + // Initialize power sampling (completely independent of timing) + KokkosTools::EnergyProfiler::g_power_sampler.reset( + new KokkosTools::EnergyProfiler::PowerSampler()); + if (KokkosTools::EnergyProfiler::g_power_sampler->initialize()) { + KokkosTools::EnergyProfiler::g_power_sampler->start_sampling(); + std::cout << "Kokkos Energy Profiler: Power sampling started" << std::endl; + } else { + std::cout << "Kokkos Energy Profiler: Power sampling initialization failed" + << std::endl; + } +} + +// Library finalize +void kokkosp_finalize_library() { + std::cout << "Kokkos Energy Profiler: Finalizing library" << std::endl; + + std::string prefix = KokkosTools::EnergyProfiler::generate_prefix(); + auto all_timings = KokkosTools::EnergyProfiler::get_all_timings(); + KokkosTools::EnergyProfiler::print_all_timings_summary( + std::cout, all_timings.begin(), all_timings.end()); + KokkosTools::EnergyProfiler::export_all_timings_csv( + all_timings, prefix + "_timing_data.csv"); + + // Stop and export power data (completely independent of timing) + if (KokkosTools::EnergyProfiler::g_power_sampler) { + KokkosTools::EnergyProfiler::g_power_sampler->stop_sampling(); + + auto power_samples = + KokkosTools::EnergyProfiler::g_power_sampler->get_samples(); + if (!power_samples.empty()) { + KokkosTools::EnergyProfiler::print_power_summary(power_samples, + "NVIDIA GPU"); + KokkosTools::EnergyProfiler::export_power_data_csv( + power_samples, prefix + "_power_data.csv"); + } + + KokkosTools::EnergyProfiler::g_power_sampler->finalize(); + KokkosTools::EnergyProfiler::g_power_sampler.reset(); + } + + std::cout << "Kokkos Energy Profiler: Library finalized" << std::endl; +} + +// Begin parallel_for +void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, + uint64_t* kID) { + if (!name || !kID) { + std::cerr << "EnergyProfiler ERROR: Error: Invalid parameters in " + "kokkosp_begin_parallel_for" + << std::endl; + return; + } + (void)devID; + uint64_t new_id = KokkosTools::EnergyProfiler::generate_new_region_id(); + *kID = new_id; + KokkosTools::EnergyProfiler::start_region( + name, KokkosTools::EnergyProfiler::RegionType::ParallelFor, *kID); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Started parallel_for '") + name + + "' on device " + std::to_string(devID) + " with ID " + + std::to_string(*kID)); +} + +// End parallel_for +void kokkosp_end_parallel_for(const uint64_t kID) { + if (kID == 0) { + std::cerr << "Error: Invalid kernel ID in kokkosp_end_parallel_for\n"; + return; + } + KokkosTools::EnergyProfiler::end_region_with_id(kID); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Ended parallel_for with ID ") + + std::to_string(kID)); +} + +// Begin parallel_scan +void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, + uint64_t* kID) { + if (!name || !kID) { + std::cerr << "Error: Invalid parameters in kokkosp_begin_parallel_scan\n"; + return; + } + (void)devID; + uint64_t new_id = KokkosTools::EnergyProfiler::generate_new_region_id(); + *kID = new_id; + KokkosTools::EnergyProfiler::start_region( + name, KokkosTools::EnergyProfiler::RegionType::ParallelScan, *kID); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Started parallel_scan '") + name + + "' on device " + std::to_string(devID) + " with ID " + + std::to_string(*kID)); +} + +// End parallel_scan +void kokkosp_end_parallel_scan(const uint64_t kID) { + if (kID == 0) { + std::cerr << "Error: Invalid kernel ID in kokkosp_end_parallel_scan\n"; + return; + } + KokkosTools::EnergyProfiler::end_region_with_id(kID); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Ended parallel_scan with ID ") + + std::to_string(kID)); +} + +// Begin parallel_reduce +void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, + uint64_t* kID) { + if (!name || !kID) { + std::cerr << "Error: Invalid parameters in kokkosp_begin_parallel_reduce\n"; + return; + } + (void)devID; + uint64_t new_id = KokkosTools::EnergyProfiler::generate_new_region_id(); + *kID = new_id; + KokkosTools::EnergyProfiler::start_region( + name, KokkosTools::EnergyProfiler::RegionType::ParallelReduce, *kID); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Started parallel_reduce '") + name + + "' on device " + std::to_string(devID) + " with ID " + + std::to_string(*kID)); +} + +// End parallel_reduce +void kokkosp_end_parallel_reduce(const uint64_t kID) { + if (kID == 0) { + std::cerr << "Error: Invalid kernel ID in kokkosp_end_parallel_reduce\n"; + return; + } + KokkosTools::EnergyProfiler::end_region_with_id(kID); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Ended parallel_reduce with ID ") + + std::to_string(kID)); +} + +// Push user region +void kokkosp_push_profile_region(char const* regionName) { + if (!regionName) { + std::cerr << "Error: Invalid region name in kokkosp_push_profile_region\n"; + return; + } + uint64_t new_id = KokkosTools::EnergyProfiler::generate_new_region_id(); + KokkosTools::EnergyProfiler::start_region( + regionName, KokkosTools::EnergyProfiler::RegionType::UserRegion, new_id); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Pushed profile region '") + + regionName + "'"); +} + +// Pop user region +void kokkosp_pop_profile_region() { + KokkosTools::EnergyProfiler::end_region_by_type( + KokkosTools::EnergyProfiler::RegionType::UserRegion); + KokkosTools::EnergyProfiler::log_verbose( + "Kokkos Energy Profiler: Popped profile region"); +} + +// Begin deep copy +void kokkosp_begin_deep_copy(Kokkos::Tools::SpaceHandle, const char* dst_name, + const void*, Kokkos::Tools::SpaceHandle, + const char* src_name, const void*, uint64_t size) { + if (!dst_name || !src_name) { + std::cerr << "Error: Invalid names in kokkosp_begin_deep_copy\n"; + return; + } + uint64_t new_id = KokkosTools::EnergyProfiler::generate_new_region_id(); + std::string name = std::string(src_name) + " -> " + std::string(dst_name); + KokkosTools::EnergyProfiler::start_region( + name, KokkosTools::EnergyProfiler::RegionType::DeepCopy, new_id); + KokkosTools::EnergyProfiler::log_verbose( + std::string("Kokkos Energy Profiler: Started deep copy from '") + + src_name + "' to '" + dst_name + "' (size: " + std::to_string(size) + + " bytes)"); +} + +// End deep copy +void kokkosp_end_deep_copy() { + KokkosTools::EnergyProfiler::end_region_by_type( + KokkosTools::EnergyProfiler::RegionType::DeepCopy); + KokkosTools::EnergyProfiler::log_verbose( + "Kokkos Energy Profiler: Ended deep copy"); +} + +} // extern "C" diff --git a/profiling/energy-profiler/nvml_provider.cpp b/profiling/energy-profiler/nvml_provider.cpp new file mode 100644 index 000000000..064bcfcad --- /dev/null +++ b/profiling/energy-profiler/nvml_provider.cpp @@ -0,0 +1,174 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos Power Profiler +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/** + * @file nvml_provider.cpp + * @brief NVML Provider implementation for GPU power monitoring. + */ + +#include "nvml_provider.hpp" +#include +#include + +namespace KokkosTools { +namespace EnergyProfiler { + +NVMLProvider::NVMLProvider() : is_initialized_(false) {} + +NVMLProvider::~NVMLProvider() { + if (is_initialized_) { + finalize(); + } +} + +bool NVMLProvider::initialize() { + if (is_initialized_) return true; + + nvmlReturn_t nvml_result = nvmlInit(); + if (NVML_SUCCESS != nvml_result) { + std::cerr << "[KokkosPowerProfiler] ERROR: Failed to initialize NVML: " + << nvmlErrorString(nvml_result) << std::endl; + return false; + } + + if (!discover_devices()) { + nvmlShutdown(); + return false; + } + + is_initialized_ = true; + return true; +} + +void NVMLProvider::finalize() { + if (!is_initialized_) return; + devices_.clear(); + device_names_.clear(); + nvmlShutdown(); + is_initialized_ = false; +} + +bool NVMLProvider::get_total_power_usage(double& power_watts) const { + if (!is_initialized_) { + std::cerr << "[KokkosPowerProfiler] ERROR: Provider not initialized" + << std::endl; + return false; + } + + power_watts = 0.0; + for (size_t i = 0; i < devices_.size(); ++i) { + double device_power = 0.0; + if (get_device_power_usage(i, device_power)) { + power_watts += device_power; + } else { + // If one device fails, we continue with the others but log an error + std::cerr << "[KokkosPowerProfiler] WARNING: Failed to get power usage " + "for device " + << i << ", skipping it." << std::endl; + } + } + return true; +} + +bool NVMLProvider::get_device_power_usage(size_t device_index, + double& power_watts) const { + if (!validate_device_index(device_index)) return false; + + unsigned int power_mW = 0; + nvmlReturn_t nvml_result = + nvmlDeviceGetPowerUsage(devices_[device_index], &power_mW); + + if (nvml_result == NVML_SUCCESS) { + power_watts = static_cast(power_mW) / 1000.0; + return true; + } else { + std::cerr + << "[KokkosPowerProfiler] ERROR: Failed to get power usage for device " + << device_index << ": " << nvmlErrorString(nvml_result) << std::endl; + return false; + } +} + +std::string NVMLProvider::get_device_name(size_t device_index) const { + if (device_index >= device_names_.size()) return "Unknown Device"; + return device_names_[device_index]; +} + +bool NVMLProvider::discover_devices() { + unsigned int device_count; + nvmlReturn_t nvml_result = nvmlDeviceGetCount(&device_count); + + if (NVML_SUCCESS != nvml_result) { + std::cerr << "[KokkosPowerProfiler] ERROR: Failed to get device count: " + << nvmlErrorString(nvml_result) << std::endl; + return false; + } + + if (device_count == 0) { + std::cerr << "[KokkosPowerProfiler] ERROR: No NVIDIA devices found" + << std::endl; + return false; + } + + devices_.resize(device_count); + device_names_.resize(device_count); + + for (unsigned int i = 0; i < device_count; ++i) { + nvml_result = nvmlDeviceGetHandleByIndex(i, &devices_[i]); + if (NVML_SUCCESS != nvml_result) { + std::cerr + << "[KokkosPowerProfiler] WARNING: Failed to get handle for device " + << i << std::endl; + devices_[i] = nullptr; + device_names_[i] = "Failed Device"; + continue; + } + + char device_name[NVML_DEVICE_NAME_BUFFER_SIZE]; + nvml_result = nvmlDeviceGetName(devices_[i], device_name, + NVML_DEVICE_NAME_BUFFER_SIZE); + if (NVML_SUCCESS == nvml_result) { + device_names_[i] = std::string(device_name); + } else { + device_names_[i] = "Unknown Device " + std::to_string(i); + } + + // Test power usage reading + unsigned int test_power_mW = 0; + nvml_result = nvmlDeviceGetPowerUsage(devices_[i], &test_power_mW); + if (NVML_SUCCESS != nvml_result) { + std::cerr << "[KokkosPowerProfiler] WARNING: Device " << i + << ": Power usage reading failed: " + << nvmlErrorString(nvml_result) << std::endl; + } + } + return true; +} + +bool NVMLProvider::validate_device_index(size_t device_index) const { + if (!is_initialized_) { + std::cerr << "[KokkosPowerProfiler] ERROR: Provider not initialized" + << std::endl; + return false; + } + if (device_index >= devices_.size()) { + std::cerr << "[KokkosPowerProfiler] ERROR: Device index " << device_index + << " out of range" << std::endl; + return false; + } + return true; +} + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/nvml_provider.hpp b/profiling/energy-profiler/nvml_provider.hpp new file mode 100644 index 000000000..f9d41907f --- /dev/null +++ b/profiling/energy-profiler/nvml_provider.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos Power Profiler +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/** + * @file nvml_provider.hpp + * @brief NVML Provider for GPU power monitoring. + */ + +#pragma once + +#include +#include +#include + +namespace KokkosTools { +namespace EnergyProfiler { + +class NVMLProvider { + public: + NVMLProvider(); + ~NVMLProvider(); + + bool initialize(); + void finalize(); + + bool get_total_power_usage(double& power_watts) const; + bool get_device_power_usage(size_t device_index, double& power_watts) const; + + size_t get_device_count() const { return devices_.size(); } + std::string get_device_name(size_t device_index) const; + bool is_initialized() const { return is_initialized_; } + + private: + bool is_initialized_; + std::vector devices_; + std::vector device_names_; + + bool discover_devices(); + bool validate_device_index(size_t device_index) const; +}; + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/power_sampler.cpp b/profiling/energy-profiler/power_sampler.cpp new file mode 100644 index 000000000..d549330b5 --- /dev/null +++ b/profiling/energy-profiler/power_sampler.cpp @@ -0,0 +1,121 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos Power Profiler +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/** + * @file power_sampler.cpp + * @brief Power sampling implementation using NVML and daemon. + */ + +#include "power_sampler.hpp" +#include + +namespace KokkosTools { +namespace EnergyProfiler { + +PowerSampler::PowerSampler() : sampling_active_(false) {} + +PowerSampler::~PowerSampler() { + if (sampling_active_.load()) { + stop_sampling(); + } + finalize(); +} + +bool PowerSampler::initialize() { + if (nvml_provider_) return true; + + nvml_provider_ = std::make_unique(); + return nvml_provider_->initialize(); +} + +void PowerSampler::finalize() { + if (nvml_provider_) { + nvml_provider_->finalize(); + nvml_provider_.reset(); + } +} + +void PowerSampler::start_sampling() { + if (!nvml_provider_ || !nvml_provider_->is_initialized()) { + std::cerr << "[KokkosPowerProfiler] ERROR: Cannot start sampling - NVML " + "provider not initialized" + << std::endl; + return; + } + + if (sampling_active_.load()) { + std::cerr << "[KokkosPowerProfiler] WARNING: Sampling already active" + << std::endl; + return; + } + + sampling_active_.store(true); + daemon_ = std::make_unique([this]() { this->sample_power(); }, + SAMPLING_INTERVAL_MS); + daemon_->start(); + + std::cout << "[KokkosPowerProfiler] INFO: Started power sampling" + << std::endl; +} + +void PowerSampler::stop_sampling() { + if (!sampling_active_.load()) { + return; + } + + sampling_active_.store(false); + if (daemon_) { + daemon_->stop(); + daemon_.reset(); + } + + std::cout << "[KokkosPowerProfiler] INFO: Stopped power sampling" + << std::endl; +} + +std::vector PowerSampler::get_samples() const { + std::lock_guard lock(samples_mutex_); + return power_samples_; +} + +void PowerSampler::clear_samples() { + std::lock_guard lock(samples_mutex_); + power_samples_.clear(); +} + +void PowerSampler::sample_power() { + if (!sampling_active_.load() || !nvml_provider_) { + return; + } + + double current_power = 0.0; + bool success = nvml_provider_->get_total_power_usage(current_power); + if (!success) { + std::cerr + << "[KokkosPowerProfiler] WARNING: Failed to get total power usage" + << std::endl; + return; + } + + auto current_timestamp = std::chrono::high_resolution_clock::now(); + + // Store sample + { + std::lock_guard lock(samples_mutex_); + power_samples_.push_back({current_timestamp, current_power}); + } +} + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/power_sampler.hpp b/profiling/energy-profiler/power_sampler.hpp new file mode 100644 index 000000000..ab7b9c3cf --- /dev/null +++ b/profiling/energy-profiler/power_sampler.hpp @@ -0,0 +1,69 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos Power Profiler +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/** + * @file power_sampler.hpp + * @brief Power sampling functionality using NVML and daemon. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include "nvml_provider.hpp" +#include "daemon.hpp" +#include "energy_profiler_constants.hpp" + +namespace KokkosTools { +namespace EnergyProfiler { + +struct PowerSample { + std::chrono::high_resolution_clock::time_point timestamp; + double power_watts; +}; + +class PowerSampler { + public: + PowerSampler(); + ~PowerSampler(); + + bool initialize(); + void finalize(); + + void start_sampling(); + void stop_sampling(); + + std::vector get_samples() const; + void clear_samples(); + + bool is_initialized() const { + return nvml_provider_ && nvml_provider_->is_initialized(); + } + bool is_sampling() const { return sampling_active_.load(); } + + private: + std::unique_ptr nvml_provider_; + std::unique_ptr daemon_; + mutable std::mutex samples_mutex_; + std::vector power_samples_; + std::atomic sampling_active_; + + void sample_power(); +}; + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/timing_export.cpp b/profiling/energy-profiler/timing_export.cpp new file mode 100644 index 000000000..bcc1ce6e0 --- /dev/null +++ b/profiling/energy-profiler/timing_export.cpp @@ -0,0 +1,143 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "timing_export.hpp" +#include +#include +#include +#include "energy_profiler_constants.hpp" + +namespace KokkosTools { +namespace EnergyProfiler { + +void export_all_timings_csv(const std::vector& all_timings, + const std::string& filename) { + std::ofstream file(filename); + if (!file.is_open()) { + std::cerr << "EnergyProfiler ERROR: Unable to open file " << filename + << " for writing." << std::endl; + return; + } + file << "name,type,start_time_epoch_ms,end_time_epoch_ms,duration_ms\n"; + for (const auto& timing : all_timings) { + auto start_ms = get_epoch_ms(timing.start_time); + auto end_ms = get_epoch_ms(timing.end_time); + auto duration_ms = get_duration_ms(timing.start_time, timing.end_time); + std::string type_str = region_type_to_string(timing.type); + file << timing.name << "," << type_str << "," << start_ms << "," << end_ms + << "," << duration_ms << "\n"; + } + std::cout << "All timing data exported to " << filename << '\n'; +} + +std::string get_category_from_type(RegionType type) { + switch (type) { + case RegionType::UserRegion: return "REGION"; + case RegionType::DeepCopy: return "DEEPCOPY"; + case RegionType::ParallelFor: + case RegionType::ParallelScan: + case RegionType::ParallelReduce: return "KERNEL"; + default: return "OTHER"; + } +} + +void print_all_timings_summary(std::ostream& os, + std::vector::const_iterator begin, + std::vector::const_iterator end) { + os << "\n==== TIMING SUMMARY ====\n"; + os << "| Category | Name | Type " + " | Start (ms) | End (ms) | Duration (ms) |\n"; + os << "|------------|----------------------------------|--------------" + "--|-------------------|-------------------|---------------|\n"; + for (auto it = begin; it != end; ++it) { + const auto& timing_info = *it; + auto start_ms = get_epoch_ms(timing_info.start_time); + auto end_ms = get_epoch_ms(timing_info.end_time); + auto duration_ms = + get_duration_ms(timing_info.start_time, timing_info.end_time); + std::string type_str = region_type_to_string(timing_info.type); + std::string category = get_category_from_type(timing_info.type); + os << "| " << std::setw(COLUMN_WIDTH_CATEGORY) << std::left << category + << " | " << std::setw(COLUMN_WIDTH_NAME) << std::left << timing_info.name + << " | " << std::setw(COLUMN_WIDTH_TYPE) << std::left << type_str + << " | " << std::setw(COLUMN_WIDTH_TIME) << std::right << start_ms + << " | " << std::setw(COLUMN_WIDTH_TIME) << std::right << end_ms << " | " + << std::setw(COLUMN_WIDTH_DURATION) << std::right << duration_ms + << " |\n"; + } +} + +void export_power_data_csv(const std::vector& samples, + const std::string& filename) { + std::ofstream file(filename); + if (!file.is_open()) { + std::cerr + << "EnergyProfiler ERROR: [KokkosPowerProfiler] Unable to open file " + << filename << " for writing." << std::endl; + return; + } + file << "timestamp_epoch_ms,power_watts\n"; + for (const auto& sample : samples) { + auto timestamp_ms = get_epoch_ms(sample.timestamp); + file << timestamp_ms << "," << std::fixed << std::setprecision(3) + << sample.power_watts << "\n"; + } + file.close(); + std::cout << "[KokkosPowerProfiler] INFO: Power data exported to " << filename + << std::endl; +} + +void print_power_summary(const std::vector& samples, + const std::string& device_name) { + if (samples.empty()) { + std::cout << "[KokkosPowerProfiler] INFO: No power samples collected.\n"; + return; + } + + // Calculate statistics + double min_power = samples[0].power_watts; + double max_power = samples[0].power_watts; + double power_sum = 0.0; + + for (const auto& sample : samples) { + double power = sample.power_watts; + power_sum += power; + if (power < min_power) min_power = power; + if (power > max_power) max_power = power; + } + + double avg_power = power_sum / samples.size(); + + // Calculate duration + auto start_time = samples.front().timestamp; + auto end_time = samples.back().timestamp; + auto duration_s = + std::chrono::duration(end_time - start_time).count(); + + std::cout << "\n==== POWER PROFILE SUMMARY ====\n"; + std::cout << std::fixed << std::setprecision(2); + std::cout << "Device: " << device_name << "\n"; + std::cout << "Total Monitoring Duration: " << duration_s << " s\n"; + std::cout << "Samples Collected: " << samples.size() << "\n"; + std::cout << "---------------------------------\n"; + std::cout << "Average Power: " << avg_power << " W\n"; + std::cout << "Minimum Power: " << min_power << " W\n"; + std::cout << "Maximum Power: " << max_power << " W\n"; + std::cout << "===============================\n"; +} + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/timing_export.hpp b/profiling/energy-profiler/timing_export.hpp new file mode 100644 index 000000000..f4c359743 --- /dev/null +++ b/profiling/energy-profiler/timing_export.hpp @@ -0,0 +1,39 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#pragma once + +#include +#include +#include "timing_utils.hpp" +#include "power_sampler.hpp" + +namespace KokkosTools { +namespace EnergyProfiler { + +void export_all_timings_csv(const std::vector& all_timings, + const std::string& filename); +void print_all_timings_summary(std::ostream& os, + std::vector::const_iterator begin, + std::vector::const_iterator end); + +void export_power_data_csv(const std::vector& samples, + const std::string& filename); +void print_power_summary(const std::vector& samples, + const std::string& device_name = "N/A"); + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/timing_utils.cpp b/profiling/energy-profiler/timing_utils.cpp new file mode 100644 index 000000000..9116d0eda --- /dev/null +++ b/profiling/energy-profiler/timing_utils.cpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "timing_utils.hpp" +#include +#include + +namespace KokkosTools { +namespace EnergyProfiler { + +std::string generate_prefix() { + char hostname[HOSTNAME_BUFFER_SIZE]; + if (gethostname(hostname, sizeof(hostname)) != 0) { + // Fallback to "unknown" if hostname fails + std::strncpy(hostname, "unknown", sizeof(hostname)); + } + int pid = (int)getpid(); + return std::string(hostname) + "-" + std::to_string(pid); +} + +std::string region_type_to_string(RegionType type) { + switch (type) { + case RegionType::ParallelFor: return "parallel_for"; + case RegionType::ParallelScan: return "parallel_scan"; + case RegionType::ParallelReduce: return "parallel_reduce"; + case RegionType::DeepCopy: return "deep_copy"; + case RegionType::UserRegion: return "user_region"; + default: return "unknown"; + } +} + +} // namespace EnergyProfiler +} // namespace KokkosTools diff --git a/profiling/energy-profiler/timing_utils.hpp b/profiling/energy-profiler/timing_utils.hpp new file mode 100644 index 000000000..441da0973 --- /dev/null +++ b/profiling/energy-profiler/timing_utils.hpp @@ -0,0 +1,121 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#pragma once + +#include +#include +#include +#include +#include "energy_profiler_constants.hpp" + +namespace KokkosTools { +namespace EnergyProfiler { + +// Helper functions for region type conversion +/// @brief Enumeration of region types +enum class RegionType { + Unknown, + ParallelFor, + ParallelReduce, + ParallelScan, + DeepCopy, + UserRegion +}; + +/// @brief Structure to hold timing information +struct TimingInfo { + std::string name; + RegionType type; + std::chrono::high_resolution_clock::time_point start_time; + std::chrono::high_resolution_clock::time_point end_time; + uint64_t id = 0; +}; + +// Singleton class to manage global state +/// @brief Singleton class for managing profiler state +class EnergyProfilerState { + public: + static EnergyProfilerState& get_instance() { + static EnergyProfilerState instance; + return instance; + } + + // Delete copy and move operations + EnergyProfilerState(const EnergyProfilerState&) = delete; + EnergyProfilerState& operator=(const EnergyProfilerState&) = delete; + EnergyProfilerState(EnergyProfilerState&&) = delete; + EnergyProfilerState& operator=(EnergyProfilerState&&) = delete; + + // Accessors for state + std::mutex& get_mutex() { return mutex_; } + std::vector& get_active_regions() { return active_regions_; } + std::vector& get_completed_timings() { + return completed_timings_; + } + uint64_t get_next_region_id() const { return next_region_id_; } + bool get_verbose_enabled() const { return verbose_enabled_; } + + // Safe setters + void increment_next_region_id() { next_region_id_++; } + void set_verbose_enabled(bool enabled) { verbose_enabled_ = enabled; } + + private: + EnergyProfilerState() : next_region_id_(1), verbose_enabled_(false) {} + + std::mutex mutex_; + std::vector active_regions_; + std::vector completed_timings_; + uint64_t next_region_id_; + bool verbose_enabled_; +}; + +// Internal functions for region management +void start_region(const std::string& name, RegionType type, uint64_t id); +void end_region_by_type(RegionType type_to_end); +void end_region_with_id(uint64_t expected_id); +uint64_t generate_new_region_id(); +bool is_verbose_enabled(); +void set_verbose_enabled(bool enabled); +void log_verbose(const std::string& message); +std::vector get_all_timings(); + +// Filename prefix generation +/// @brief Generate a prefix for output files based on hostname and PID +std::string generate_prefix(); + +/// @brief Convert RegionType to string +std::string region_type_to_string(RegionType type); + +// Helper functions for timing calculations +/// @brief Get epoch milliseconds from time point +template +long get_epoch_ms(const TimePoint& time_point) { + return std::chrono::duration_cast( + time_point.time_since_epoch()) + .count(); +} + +/// @brief Get duration in milliseconds between two time points +template +long get_duration_ms(const TimePoint& start_time, const TimePoint& end_time) { + auto duration = end_time - start_time; + return std::chrono::duration_cast(duration) + .count(); +} + +} // namespace EnergyProfiler +} // namespace KokkosTools