diff --git a/.github/workflows/build-with-kokkos.yml b/.github/workflows/build-with-kokkos.yml index 6eabf3383..a70528321 100644 --- a/.github/workflows/build-with-kokkos.yml +++ b/.github/workflows/build-with-kokkos.yml @@ -30,6 +30,7 @@ jobs: env: Kokkos_ROOT: /opt/kokkos VARIORUM_ROOT: /opt/variorum + ROCM_PATH: /opt/rocm steps: - name: Checkout Kokkos Tools uses: actions/checkout@v4 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3113e3898..39ed3f2db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,43 @@ option(KokkosTools_ENABLE_SYSTEMTAP "Enable SystemTap support" OFF) option(KokkosTools_ENABLE_EXAMPLES "Build examples" OFF) option(KokkosTools_ENABLE_TESTS "Build tests" OFF) +# Configure CMAKE_PREFIX_PATH for ROCm/HIP if ROCM_PATH is set +# This is needed for find_package(Kokkos) when Kokkos was built with HIP support +set(ROCM_PATH_TO_USE "") +if(DEFINED ENV{ROCM_PATH} AND NOT "$ENV{ROCM_PATH}" STREQUAL "") + set(ROCM_PATH_TO_USE "$ENV{ROCM_PATH}") +elseif(EXISTS "/opt/rocm") + # Fallback to default ROCm installation path + set(ROCM_PATH_TO_USE "/opt/rocm") + message(STATUS "ROCM_PATH not set, using default: /opt/rocm") +endif() + +if(NOT "${ROCM_PATH_TO_USE}" STREQUAL "") + message(STATUS "Configuring ROCm paths from: ${ROCM_PATH_TO_USE}") + + # Add multiple potential locations for ROCm CMake configs + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH_TO_USE}") + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH_TO_USE}/lib/cmake") + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH_TO_USE}/hip/lib/cmake") + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH_TO_USE}/lib/cmake/hip") + + # Also set hip_DIR as a hint for find_package(hip) if not already set + # Try multiple potential locations + if(NOT DEFINED hip_DIR) + if(EXISTS "${ROCM_PATH_TO_USE}/lib/cmake/hip") + set(hip_DIR "${ROCM_PATH_TO_USE}/lib/cmake/hip" CACHE PATH "Path to hip CMake config") + message(STATUS "Setting hip_DIR hint: ${hip_DIR}") + elseif(EXISTS "${ROCM_PATH_TO_USE}/hip/lib/cmake/hip") + set(hip_DIR "${ROCM_PATH_TO_USE}/hip/lib/cmake/hip" CACHE PATH "Path to hip CMake config") + message(STATUS "Setting hip_DIR hint: ${hip_DIR}") + else() + message(STATUS "hip CMake config not found in standard locations under ${ROCM_PATH_TO_USE}") + endif() + endif() + + message(STATUS "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}") +endif() + # Fetch Kokkos options: acquire_kokkos_config() if(DEFINED Kokkos_FOUND_MSG) diff --git a/profiling/nvtx-connector/kp_nvtx_connector.cpp b/profiling/nvtx-connector/kp_nvtx_connector.cpp index 2e6fa9bf2..e3f654770 100644 --- a/profiling/nvtx-connector/kp_nvtx_connector.cpp +++ b/profiling/nvtx-connector/kp_nvtx_connector.cpp @@ -118,6 +118,10 @@ void kokkosp_stop_profile_section(const uint32_t sID) { nvtxRangeEnd(section.id); } +void kokkosp_destroy_profile_section(const uint32_t sID) { + // NVTX ranges are automatically managed, no explicit destroy needed +} + void kokkosp_profile_event(const char* name) { nvtxMarkA(name); } void kokkosp_begin_fence(const char* name, const uint32_t deviceId, @@ -147,23 +151,24 @@ Kokkos::Tools::Experimental::EventSet get_event_set() { Kokkos::Tools::Experimental::EventSet my_event_set; memset(&my_event_set, 0, sizeof(my_event_set)); // zero any pointers not set here - my_event_set.request_tool_settings = kokkosp_request_tool_settings; - my_event_set.init = kokkosp_init_library; - my_event_set.finalize = kokkosp_finalize_library; - my_event_set.push_region = kokkosp_push_profile_region; - my_event_set.pop_region = kokkosp_pop_profile_region; - my_event_set.begin_parallel_for = kokkosp_begin_parallel_for; - my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce; - my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan; - my_event_set.end_parallel_for = kokkosp_end_parallel_for; - my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce; - my_event_set.end_parallel_scan = kokkosp_end_parallel_scan; - my_event_set.create_profile_section = kokkosp_create_profile_section; - my_event_set.start_profile_section = kokkosp_start_profile_section; - my_event_set.stop_profile_section = kokkosp_stop_profile_section; - my_event_set.profile_event = kokkosp_profile_event; - my_event_set.begin_fence = kokkosp_begin_fence; - my_event_set.end_fence = kokkosp_end_fence; + my_event_set.request_tool_settings = kokkosp_request_tool_settings; + my_event_set.init = kokkosp_init_library; + my_event_set.finalize = kokkosp_finalize_library; + my_event_set.push_region = kokkosp_push_profile_region; + my_event_set.pop_region = kokkosp_pop_profile_region; + my_event_set.begin_parallel_for = kokkosp_begin_parallel_for; + my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce; + my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan; + my_event_set.end_parallel_for = kokkosp_end_parallel_for; + my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce; + my_event_set.end_parallel_scan = kokkosp_end_parallel_scan; + my_event_set.create_profile_section = kokkosp_create_profile_section; + my_event_set.start_profile_section = kokkosp_start_profile_section; + my_event_set.stop_profile_section = kokkosp_stop_profile_section; + my_event_set.destroy_profile_section = kokkosp_destroy_profile_section; + my_event_set.profile_event = kokkosp_profile_event; + my_event_set.begin_fence = kokkosp_begin_fence; + my_event_set.end_fence = kokkosp_end_fence; return my_event_set; } @@ -188,6 +193,7 @@ EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce) EXPOSE_CREATE_PROFILE_SECTION(impl::kokkosp_create_profile_section) EXPOSE_START_PROFILE_SECTION(impl::kokkosp_start_profile_section) EXPOSE_STOP_PROFILE_SECTION(impl::kokkosp_stop_profile_section) +EXPOSE_DESTROY_PROFILE_SECTION(impl::kokkosp_destroy_profile_section) EXPOSE_PROFILE_EVENT(impl::kokkosp_profile_event); EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence); EXPOSE_END_FENCE(impl::kokkosp_end_fence); diff --git a/profiling/vtune-connector/kp_vtune_connector.cpp b/profiling/vtune-connector/kp_vtune_connector.cpp index ef4550846..06285bd82 100644 --- a/profiling/vtune-connector/kp_vtune_connector.cpp +++ b/profiling/vtune-connector/kp_vtune_connector.cpp @@ -20,20 +20,42 @@ #include #include #include +#include +#include #include "kp_core.hpp" #include "kp_vtune_connector_domain.h" +namespace { +struct Section { + std::string label; + __itt_domain* domain; +}; +std::vector
kokkosp_sections; +std::stack<__itt_domain*> kokkosp_region_stack; +} // namespace + namespace KokkosTools { namespace VTuneConnector { static KernelVTuneConnectorInfo* currentKernel; static std::unordered_map domain_map; static uint64_t nextKernelID; +static bool tool_globfences = false; + +void kokkosp_request_tool_settings(const uint32_t, + Kokkos_Tools_ToolSettings* settings) { + settings->requires_global_fencing = tool_globfences; +} void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, const uint32_t devInfoCount, Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) { + const char* tool_global_fences = getenv("KOKKOS_TOOLS_GLOBALFENCES"); + if (NULL != tool_global_fences) { + tool_globfences = (atoi(tool_global_fences) != 0); + } + printf("-----------------------------------------------------------\n"); printf("KokkosP: VTune Analyzer Connector (sequence is %d, version: %llu)\n", loadSeq, interfaceVer); @@ -129,18 +151,86 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { currentKernel = NULL; } +void kokkosp_push_profile_region(const char* name) { + __itt_domain* domain = __itt_domain_create(name); + domain->flags = 1; + kokkosp_region_stack.push(domain); + __itt_frame_begin_v3(domain, NULL); +} + +void kokkosp_pop_profile_region() { + if (!kokkosp_region_stack.empty()) { + __itt_domain* domain = kokkosp_region_stack.top(); + kokkosp_region_stack.pop(); + __itt_frame_end_v3(domain, NULL); + } +} + +void kokkosp_create_profile_section(const char* name, uint32_t* sID) { + *sID = kokkosp_sections.size(); + __itt_domain* domain = __itt_domain_create(name); + domain->flags = 1; + kokkosp_sections.push_back({std::string(name), domain}); +} + +void kokkosp_start_profile_section(const uint32_t sID) { + if (sID >= kokkosp_sections.size()) return; + auto& section = kokkosp_sections[sID]; + __itt_frame_begin_v3(section.domain, NULL); +} + +void kokkosp_stop_profile_section(const uint32_t sID) { + if (sID >= kokkosp_sections.size()) return; + auto const& section = kokkosp_sections[sID]; + __itt_frame_end_v3(section.domain, NULL); +} + +void kokkosp_destroy_profile_section(const uint32_t sID) { + // VTune domains are not explicitly destroyed +} + +void kokkosp_profile_event(const char* name) { + __itt_event event = __itt_event_create(name, strlen(name)); + __itt_event_start(event); +} + +void kokkosp_begin_fence(const char* name, const uint32_t /*deviceId*/, + uint64_t* handle) { + __itt_domain* domain = __itt_domain_create(name); + domain->flags = 1; + __itt_frame_begin_v3(domain, NULL); + // Store domain in handle for use in end_fence + // This is not ideal but VTune API doesn't provide a better way + *handle = reinterpret_cast(domain); +} + +void kokkosp_end_fence(uint64_t handle) { + __itt_domain* domain = reinterpret_cast<__itt_domain*>(handle); + __itt_frame_end_v3(domain, NULL); +} + Kokkos::Tools::Experimental::EventSet get_event_set() { Kokkos::Tools::Experimental::EventSet my_event_set; memset(&my_event_set, 0, sizeof(my_event_set)); // zero any pointers not set here - my_event_set.init = kokkosp_init_library; - my_event_set.finalize = kokkosp_finalize_library; - my_event_set.begin_parallel_for = kokkosp_begin_parallel_for; - my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce; - my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan; - my_event_set.end_parallel_for = kokkosp_end_parallel_for; - my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce; - my_event_set.end_parallel_scan = kokkosp_end_parallel_scan; + my_event_set.request_tool_settings = kokkosp_request_tool_settings; + my_event_set.init = kokkosp_init_library; + my_event_set.finalize = kokkosp_finalize_library; + my_event_set.push_region = kokkosp_push_profile_region; + my_event_set.pop_region = kokkosp_pop_profile_region; + my_event_set.begin_parallel_for = kokkosp_begin_parallel_for; + my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce; + my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan; + my_event_set.end_parallel_for = kokkosp_end_parallel_for; + my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce; + my_event_set.end_parallel_scan = kokkosp_end_parallel_scan; + my_event_set.create_profile_section = kokkosp_create_profile_section; + my_event_set.start_profile_section = kokkosp_start_profile_section; + my_event_set.stop_profile_section = kokkosp_stop_profile_section; + my_event_set.destroy_profile_section = kokkosp_destroy_profile_section; + my_event_set.profile_event = kokkosp_profile_event; + my_event_set.begin_fence = kokkosp_begin_fence; + my_event_set.end_fence = kokkosp_end_fence; return my_event_set; } @@ -151,13 +241,23 @@ extern "C" { namespace impl = KokkosTools::VTuneConnector; +EXPOSE_TOOL_SETTINGS(impl::kokkosp_request_tool_settings) EXPOSE_INIT(impl::kokkosp_init_library) EXPOSE_FINALIZE(impl::kokkosp_finalize_library) +EXPOSE_PUSH_REGION(impl::kokkosp_push_profile_region) +EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region) EXPOSE_BEGIN_PARALLEL_FOR(impl::kokkosp_begin_parallel_for) EXPOSE_END_PARALLEL_FOR(impl::kokkosp_end_parallel_for) EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan) EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan) EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce) EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce) +EXPOSE_CREATE_PROFILE_SECTION(impl::kokkosp_create_profile_section) +EXPOSE_START_PROFILE_SECTION(impl::kokkosp_start_profile_section) +EXPOSE_STOP_PROFILE_SECTION(impl::kokkosp_stop_profile_section) +EXPOSE_DESTROY_PROFILE_SECTION(impl::kokkosp_destroy_profile_section) +EXPOSE_PROFILE_EVENT(impl::kokkosp_profile_event); +EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence); +EXPOSE_END_FENCE(impl::kokkosp_end_fence); } // extern "C"