Skip to content

[FEATURE] Add Per-thread runtime distribution example for bpftime GPU tracing #505

@yunwei37

Description

@yunwei37

This is a demo of how it works: you should write the probe in eBPF, add it to https://github.com/eunomia-bpf/bpftime/tree/master/example/gpu, and make sure users don't need to modify their program to do the tracing.

Program 2: Per-thread runtime distribution using clock64()

This program measures a per-thread runtime distribution for a simple synthetic workload inside a kernel. The idea:

  • Each thread runs a small loop with a different number of iterations (dependent on its global thread ID).
  • At the beginning and end of the loop, the thread reads clock64() (a per-SM cycle counter).
  • The difference end - start gives the number of cycles spent in the “work” section for that thread.
  • On the host, we convert cycles to microseconds and print per-thread timings plus simple summary statistics.

This is the manual, in-kernel counterpart to what you would conceptually want from a “per-thread runtime profiler.”

// thread_time_distribution_demo.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Simple CUDA error checking
// Wrap every CUDA runtime call: on failure, print the error string, the
// numeric code, and the call site, then terminate the process.
#define CHECK_CUDA(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s (%d) at %s:%d\n",               \
                    cudaGetErrorString(status_), status_,                  \
                    __FILE__, __LINE__);                                   \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Per-thread timing info
struct ThreadTime {
    unsigned int blockId;     // blockIdx.x of the recording thread (1D grid)
    unsigned int threadId;    // flat global thread ID: blockIdx.x * blockDim.x + threadIdx.x
    unsigned long long cycles; // clock64() delta around the work loop (per-SM cycle counter)
};

// Kernel: every thread times its own synthetic workload with clock64().
//
// Launch layout: 1D grid of 1D blocks; `out` must hold at least
// gridDim.x * blockDim.x entries. Each thread writes exactly one
// ThreadTime record at index == its global thread ID. clock64() reads the
// per-SM cycle counter, so the delta is cycles spent in the work section.
__global__ void timed_work_kernel(ThreadTime* out, int base_iters)
{
    const unsigned int block = blockIdx.x;
    const unsigned int gtid  = blockIdx.x * blockDim.x + threadIdx.x;

    // Vary the iteration count per thread (low 5 bits of the global ID)
    // so the recorded cycle counts form a non-trivial distribution.
    const int iters = base_iters + (int)(gtid & 0x1Fu);

    const unsigned long long t0 = clock64();

    // volatile keeps the accumulator live so the loop cannot be removed.
    volatile float acc = 0.0f;
    int i = 0;
    while (i < iters) {
        acc += 1.0f;
        ++i;
    }

    const unsigned long long t1 = clock64();

    // Extra defense against dead-code elimination: acc can never reach -1.
    if (acc == -1.0f) {
        printf("This will never be printed\n");
    }

    ThreadTime rec;
    rec.blockId  = block;
    rec.threadId = gtid;
    rec.cycles   = t1 - t0;

    out[gtid] = rec;
}

// Host driver: launches the timing kernel, copies the per-thread records
// back, converts cycles to microseconds, and prints per-thread timings plus
// min/max/avg summary statistics.
//
// Fixes vs. the original:
//  - cudaDeviceProp::clockRate is deprecated and reads 0 on some
//    platforms/CUDA versions; that made every cycles->time conversion
//    divide by zero. We now fall back to 1 GHz with a warning.
//  - the device buffer was leaked on the host-malloc failure path.
int main()
{
    // Keep the launch small so the per-thread printout stays readable.
    const int BLOCKS            = 4;
    const int THREADS_PER_BLOCK = 64;
    const int TOTAL_THREADS     = BLOCKS * THREADS_PER_BLOCK;

    const int BASE_ITERS        = 100000; // base workload per thread

    printf("=== Per-thread runtime distribution demo ===\n");
    printf("Grid: %d blocks, Block: %d threads (total %d threads)\n",
           BLOCKS, THREADS_PER_BLOCK, TOTAL_THREADS);

    // One ThreadTime record per thread, on device and host.
    ThreadTime* d_times = nullptr;
    CHECK_CUDA(cudaMalloc(&d_times, TOTAL_THREADS * sizeof(ThreadTime)));

    ThreadTime* h_times = (ThreadTime*)std::malloc(TOTAL_THREADS * sizeof(ThreadTime));
    if (!h_times) {
        fprintf(stderr, "Host malloc failed\n");
        cudaFree(d_times); // don't leak the device buffer on early exit
        std::exit(EXIT_FAILURE);
    }

    // 1D launch to match the kernel's flat indexing.
    dim3 grid(BLOCKS);
    dim3 block(THREADS_PER_BLOCK);

    timed_work_kernel<<<grid, block>>>(d_times, BASE_ITERS);
    CHECK_CUDA(cudaGetLastError());      // catch launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize()); // catch asynchronous execution errors

    // Copy results back (kernel already synchronized above).
    CHECK_CUDA(cudaMemcpy(h_times, d_times,
                          TOTAL_THREADS * sizeof(ThreadTime),
                          cudaMemcpyDeviceToHost));

    // Get the device clock rate to convert cycles -> time.
    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));

    // clockRate is reported in kHz (cycles per millisecond). The field is
    // deprecated and may read 0; guard so the conversions below never
    // divide by zero.
    double sm_freq_hz = (double)prop.clockRate * 1000.0;
    if (sm_freq_hz <= 0.0) {
        fprintf(stderr,
                "Warning: device clock rate unavailable; assuming 1 GHz "
                "(reported times are approximate)\n");
        sm_freq_hz = 1.0e9;
    }

    printf("\nDevice: %s\n", prop.name);
    printf("SM clock rate: %.3f MHz\n\n", sm_freq_hz / 1.0e6);

    // Per-thread timing table, accumulating summary stats as we go.
    printf("Per-thread timing:\n");
    printf("tid  block     cycles        time_us\n");
    printf("--------------------------------------\n");

    unsigned long long min_cycles = ~0ULL;
    unsigned long long max_cycles = 0;
    unsigned long long sum_cycles = 0;

    for (int i = 0; i < TOTAL_THREADS; ++i) {
        const ThreadTime& t = h_times[i];

        double time_sec = (double)t.cycles / sm_freq_hz;
        double time_us  = time_sec * 1.0e6;

        printf("%3u  %5u  %10llu  %10.3f\n",
               t.threadId, t.blockId,
               (unsigned long long)t.cycles, time_us);

        if (t.cycles < min_cycles) min_cycles = t.cycles;
        if (t.cycles > max_cycles) max_cycles = t.cycles;
        sum_cycles += t.cycles;
    }

    double avg_cycles = (double)sum_cycles / TOTAL_THREADS;
    double min_us = (double)min_cycles / sm_freq_hz * 1.0e6;
    double max_us = (double)max_cycles / sm_freq_hz * 1.0e6;
    double avg_us = avg_cycles / sm_freq_hz * 1.0e6;

    printf("\nSummary over all threads:\n");
    printf("  min cycles = %llu (%.3f us)\n",
           (unsigned long long)min_cycles, min_us);
    printf("  max cycles = %llu (%.3f us)\n",
           (unsigned long long)max_cycles, max_us);
    printf("  avg cycles = %.1f (%.3f us)\n",
           avg_cycles, avg_us);

    // Cleanup
    CHECK_CUDA(cudaFree(d_times));
    std::free(h_times);

    return 0;
}

Build and run:

nvcc -O2 -arch=sm_70 thread_time_distribution_demo.cu -o thread_time_demo
./thread_time_demo

From there you can:

  • Redirect the output to a file and build histograms in Python/R, or
  • Strip the printfs and instead dump the ThreadTime array to a binary/CSV for offline analysis.

If you want a warp-only or block-only distribution instead of per-thread, the only change is to have only one lane (or one thread per block) record the timing and aggregate locally before writing to global memory.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions