Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 107 additions & 12 deletions profiling/simple-kernel-timer/kp_kernel_timer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,93 @@

namespace KokkosTools {
namespace KernelTimer {
// Prints the collected timing data as a plain-text report using printf.
//
// The report consists of a column header, a "Regions" list (entries whose
// kernel type is REGION), a "Kernels" list (all other entries) and a
// "Summary" block with totals. Entries appear in the order produced by
// std::sort with compareKernelPerformanceInfo.
//
// Parameters:
//   count_map        - map from a name to a KernelPerformanceInfo pointer;
//                      every mapped pointer is reported. This function only
//                      calls the get* accessors on the entries.
//   totalExecuteTime - total program time; used as the denominator of the
//                      "%Total Program Time" column and printed in the
//                      summary (labelled as seconds).
//
// Divisions are guarded: the per-call average divides by
// max(1.0, call count) and the percentages divide by max(1e-9, total), so an
// entry with zero calls or a zero total does not divide by zero.
// In the summary, "Time outside Kokkos kernels" is
// totalExecuteTime - totalKernelsTime, where totalKernelsTime sums only the
// non-REGION entries.
void print_ascii(std::map<std::string, KernelPerformanceInfo*>& count_map,
double totalExecuteTime) {
// Flat list of the map's values so they can be sorted for display.
std::vector<KernelPerformanceInfo*> kernelInfo;
double totalKernelsTime = 0;
uint64_t totalKernelsCalls = 0;

// Only the mapped pointers are needed; the key is not used here (rows print
// info->getName() instead).
for (auto const& [name, info] : count_map) {
kernelInfo.push_back(info);
}

std::sort(kernelInfo.begin(), kernelInfo.end(), compareKernelPerformanceInfo);

// Calculate total time in kernels and total calls to kernels for summary.
// REGION entries are excluded from both totals.
for (auto const& info : kernelInfo) {
if (info->getKernelType() != REGION) {
totalKernelsTime += info->getTime();
totalKernelsCalls += info->getCallCount();
}
}

// Header matching kp_reader.cpp
printf(
"\n (Type) Total Time, Call Count, Avg. Time per Call, %%Total Time in "
"Kernels, %%Total Program Time\n");
printf(
"------------------------------------------------------------------------"
"-\n\n");

// Column separator for each row; defaults to a single space.
char delimiter = ' ';
// We check for the environment delimiter if set during init.
// outputDelimiter is declared outside this function; only its first
// character is used, and a null or empty value keeps the default.
if (outputDelimiter != nullptr && strlen(outputDelimiter) > 0) {
delimiter = outputDelimiter[0];
}

// Prints one entry as two lines: "- <name>", then
// <type><d><time><d><call count><d><avg time per call><d><% of kernel
// time><d><% of program time>, where <d> is the delimiter chosen above.
// The kernel-time percentage uses totalKernelsTime for every row, including
// rows printed in the Regions section.
auto print_row = [&](KernelPerformanceInfo* info) {
// Converted to double so it can be clamped with std::max(1.0, ...) below.
const double callCountDouble = (double)info->getCallCount();
// Label defaults to Region; the switch replaces it for the three parallel
// kernel types. NOTE(review): any other non-REGION type would fall through
// "default" and be listed under "Kernels" with the Region label - confirm
// that no such type exists.
const char* typeStr = " (Region) ";
switch (info->getKernelType()) {
case PARALLEL_FOR: typeStr = " (ParFor) "; break;
case PARALLEL_REDUCE: typeStr = " (ParRed) "; break;
case PARALLEL_SCAN: typeStr = " (ParScan) "; break;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we use parallel_for etc to make it easier for users to map it to Kokkos parallel dispatch?

Copy link
Copy Markdown
Author

@yasahi-hpc yasahi-hpc Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean I just keep the original behavior from kp_reader.cpp.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fine for now.

default: break;
}

printf(
"- %s\n%s%c%f%c%" PRIu64 "%c%f%c%f%c%f\n", info->getName().c_str(),
typeStr, delimiter, info->getTime(), delimiter, info->getCallCount(),
delimiter, info->getTime() / std::max(1.0, callCountDouble), delimiter,
(info->getTime() / std::max(1e-9, totalKernelsTime)) * 100.0, delimiter,
(info->getTime() / std::max(1e-9, totalExecuteTime)) * 100.0);
};

printf("Regions: \n\n");
for (auto const& info : kernelInfo) {
if (info->getKernelType() == REGION) print_row(info);
}

printf(
"\n----------------------------------------------------------------------"
"---\n");
printf("Kernels: \n\n");
for (auto const& info : kernelInfo) {
if (info->getKernelType() != REGION) print_row(info);
}

printf(
"\n----------------------------------------------------------------------"
"---\n");
printf("Summary:\n\n");
printf(
"Total Execution Time (incl. Kokkos + non-Kokkos): %20.5f seconds\n",
totalExecuteTime);
printf(
"Total Time in Kokkos kernels: %20.5f seconds\n",
totalKernelsTime);
printf(
" -> Time outside Kokkos kernels: %20.5f seconds\n",
(totalExecuteTime - totalKernelsTime));
printf(" -> Percentage in Kokkos kernels: %20.2f %%\n",
(totalKernelsTime / std::max(1e-9, totalExecuteTime)) * 100.0);
printf("Total Calls to Kokkos Kernels: %20" PRIu64
"\n",
totalKernelsCalls);
printf(
"------------------------------------------------------------------------"
"-\n\n");
}

bool is_region(KernelPerformanceInfo const& kp) {
return kp.getKernelType() == REGION;
Expand Down Expand Up @@ -42,15 +129,25 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
}

void kokkosp_finalize_library() {
double finishTime = seconds();
double finishTime = seconds();
const double totalExecuteTime = (finishTime - initTime);

const char* kokkos_tools_timer_json_raw = getenv("KOKKOS_TOOLS_TIMER_JSON");
const bool kokkos_tools_timer_json =
kokkos_tools_timer_json_raw == NULL
? false
: strcmp(kokkos_tools_timer_json_raw, "1") == 0 ||
strcmp(kokkos_tools_timer_json_raw, "true") == 0 ||
strcmp(kokkos_tools_timer_json_raw, "True") == 0;
auto is_enabled = [](const char* env_var) {
const char* env_var_raw = getenv(env_var);
return env_var_raw != nullptr &&
(strcmp(env_var_raw, "1") == 0 || strcmp(env_var_raw, "true") == 0 ||
strcmp(env_var_raw, "True") == 0);
};

const bool kokkos_tools_timer_json = is_enabled("KOKKOS_TOOLS_TIMER_JSON");
const bool kokkos_tools_timer_binary =
is_enabled("KOKKOS_TOOLS_TIMER_BINARY");

// Quick return for ascii output (default)
if (!kokkos_tools_timer_json && !kokkos_tools_timer_binary) {
print_ascii(count_map, totalExecuteTime);
return;
}

double kernelTimes = 0;

Expand All @@ -63,16 +160,14 @@ void kokkosp_finalize_library() {

free(hostname);
FILE* output_data = fopen(fileOutput, "wb");

const double totalExecuteTime = (finishTime - initTime);
if (!kokkos_tools_timer_json) {
if (kokkos_tools_timer_binary) {
fwrite(&totalExecuteTime, sizeof(totalExecuteTime), 1, output_data);

for (auto kernel_itr = count_map.begin(); kernel_itr != count_map.end();
kernel_itr++) {
kernel_itr->second->writeToBinaryFile(output_data);
}
} else {
} else if (kokkos_tools_timer_json) {
std::vector<KernelPerformanceInfo*> kernelList;

for (auto kernel_itr = count_map.begin(); kernel_itr != count_map.end();
Expand Down