|
| 1 | +#include <cuda_runtime.h> |
| 2 | + |
| 3 | +// WARN: for simplicity, we enable and "embed" the Tracy client directly into the code |
| 4 | +#define TRACY_ENABLE |
| 5 | +#include <TracyClient.cpp> |
| 6 | + |
| 7 | +#include <tracy/Tracy.hpp> |
| 8 | +#include <tracy/TracyCUDA.hpp> |
| 9 | + |
| 10 | +#include <cstdio> |
| 11 | +#include <cstdlib> |
| 12 | +#include <vector> |
| 13 | + |
| 14 | +#define CUDA_CHECK(call) \ |
| 15 | + do { \ |
| 16 | + cudaError_t err__ = (call); \ |
| 17 | + if (err__ != cudaSuccess) { \ |
| 18 | + std::fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \ |
| 19 | + cudaGetErrorName(err__), __FILE__, __LINE__, \ |
| 20 | + cudaGetErrorString(err__)); \ |
| 21 | + std::exit(EXIT_FAILURE); \ |
| 22 | + } \ |
| 23 | + } while (0) |
| 24 | + |
| 25 | +__global__ void saxpy(float a, const float* x, float* y, int n) |
| 26 | +{ |
| 27 | + int i = blockIdx.x * blockDim.x + threadIdx.x; |
| 28 | + if (i < n) y[i] = a * x[i] + y[i]; |
| 29 | +} |
| 30 | + |
| 31 | +int main() |
| 32 | +{ |
| 33 | + // CUPTI-backed Tracy context. Auto-captures all CUDA activity from the |
| 34 | + // point StartProfiling() is called until StopProfiling(). The background |
| 35 | + // collector thread flushes activity into Tracy; the explicit Collect() |
| 36 | + // calls below just force a flush at known phase boundaries. |
| 37 | + auto* cudaCtx = TracyCUDAContext(); |
| 38 | + { |
| 39 | + constexpr char ctxName[] = "CUDA Graph Demo"; |
| 40 | + TracyCUDAContextName(cudaCtx, ctxName, sizeof(ctxName) - 1); |
| 41 | + } |
| 42 | + TracyCUDAStartProfiling(cudaCtx); |
| 43 | + |
| 44 | + constexpr int N = 1 << 16; // small N => kernel is short => launch overhead dominates |
| 45 | + constexpr int KERNELS_PER_GRAPH = 32; // chain length captured into the graph |
| 46 | + constexpr int OUTER_ITERS = 2000; // how many times we replay the chain |
| 47 | + |
| 48 | + // allocate device buffers |
| 49 | + float *dX = nullptr, *dY = nullptr; |
| 50 | + CUDA_CHECK(cudaMalloc(&dX, N * sizeof(float))); |
| 51 | + CUDA_CHECK(cudaMalloc(&dY, N * sizeof(float))); |
| 52 | + |
| 53 | + std::vector<float> hX(N, 1.0f); |
| 54 | + CUDA_CHECK(cudaMemcpy(dX, hX.data(), N * sizeof(float), cudaMemcpyHostToDevice)); |
| 55 | + |
| 56 | + cudaStream_t stream = nullptr; |
| 57 | + CUDA_CHECK(cudaStreamCreate(&stream)); |
| 58 | + |
| 59 | + const dim3 block(256); |
| 60 | + const dim3 grid((N + block.x - 1) / block.x); |
| 61 | + |
| 62 | + cudaEvent_t evStart, evStop; |
| 63 | + CUDA_CHECK(cudaEventCreate(&evStart)); |
| 64 | + CUDA_CHECK(cudaEventCreate(&evStop)); |
| 65 | + |
| 66 | + // warm-up (so first-launch lazy-init and/or JIT doesn't bias the measurement) |
| 67 | + saxpy<<<grid, block, 0, stream>>>(0.0f, dX, dY, N); |
| 68 | + CUDA_CHECK(cudaStreamSynchronize(stream)); |
| 69 | + |
| 70 | + // baseline: launch each kernel directly on the stream |
| 71 | + float msStream = 0.0f; |
| 72 | + { |
| 73 | + ZoneScopedN("stream-launches"); |
| 74 | + CUDA_CHECK(cudaMemsetAsync(dY, 0, N * sizeof(float), stream)); |
| 75 | + CUDA_CHECK(cudaEventRecord(evStart, stream)); |
| 76 | + for (int outer = 0; outer < OUTER_ITERS; ++outer) { |
| 77 | + for (int k = 0; k < KERNELS_PER_GRAPH; ++k) { |
| 78 | + saxpy<<<grid, block, 0, stream>>>(1.0e-6f, dX, dY, N); |
| 79 | + } |
| 80 | + } |
| 81 | + CUDA_CHECK(cudaEventRecord(evStop, stream)); |
| 82 | + CUDA_CHECK(cudaEventSynchronize(evStop)); |
| 83 | + CUDA_CHECK(cudaEventElapsedTime(&msStream, evStart, evStop)); |
| 84 | + TracyCUDACollect(cudaCtx); |
| 85 | + } |
| 86 | + |
| 87 | + // capture: record the same kernel chain into a graph |
| 88 | + cudaGraph_t graph = nullptr; |
| 89 | + cudaGraphExec_t graphExec = nullptr; |
| 90 | + { |
| 91 | + ZoneScopedN("graph-capture"); |
| 92 | + // cudaStreamCaptureModeRelaxed allows the calling thread to perform |
| 93 | + // unrelated CUDA work during capture; ThreadLocal is stricter if you need |
| 94 | + // isolation. Most short, single-stream captures work fine in either mode. |
| 95 | + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed)); |
| 96 | + for (int k = 0; k < KERNELS_PER_GRAPH; ++k) { |
| 97 | + saxpy<<<grid, block, 0, stream>>>(1.0e-6f, dX, dY, N); |
| 98 | + } |
| 99 | + CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); |
| 100 | + |
| 101 | + // Instantiate once -> reusable executable graph. |
| 102 | + CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); |
| 103 | + |
| 104 | + // The template graph isn't needed once instantiated. |
| 105 | + CUDA_CHECK(cudaGraphDestroy(graph)); |
| 106 | + } |
| 107 | + |
| 108 | + // replay: launch the instantiated graph OUTER_ITERS times |
| 109 | + float msGraph = 0.0f; |
| 110 | + { |
| 111 | + ZoneScopedN("graph-launches"); |
| 112 | + CUDA_CHECK(cudaMemsetAsync(dY, 0, N * sizeof(float), stream)); |
| 113 | + CUDA_CHECK(cudaEventRecord(evStart, stream)); |
| 114 | + for (int outer = 0; outer < OUTER_ITERS; ++outer) { |
| 115 | + CUDA_CHECK(cudaGraphLaunch(graphExec, stream)); |
| 116 | + } |
| 117 | + CUDA_CHECK(cudaEventRecord(evStop, stream)); |
| 118 | + CUDA_CHECK(cudaEventSynchronize(evStop)); |
| 119 | + CUDA_CHECK(cudaEventElapsedTime(&msGraph, evStart, evStop)); |
| 120 | + TracyCUDACollect(cudaCtx); |
| 121 | + } |
| 122 | + |
| 123 | + // sanity check: y[i] = OUTER_ITERS * KERNELS_PER_GRAPH * 1e-6 * x[i] |
| 124 | + std::vector<float> hY(N); |
| 125 | + CUDA_CHECK(cudaMemcpy(hY.data(), dY, N * sizeof(float), cudaMemcpyDeviceToHost)); |
| 126 | + const float expected = float(OUTER_ITERS) * float(KERNELS_PER_GRAPH) * 1.0e-6f; |
| 127 | + |
| 128 | + std::printf("Stream launches: %8.3f ms (%d kernels)\n", |
| 129 | + msStream, OUTER_ITERS * KERNELS_PER_GRAPH); |
| 130 | + std::printf("Graph launches: %8.3f ms (%d graph launches x %d kernels)\n", |
| 131 | + msGraph, OUTER_ITERS, KERNELS_PER_GRAPH); |
| 132 | + std::printf("Speedup : %8.2fx\n", msStream / msGraph); |
| 133 | + std::printf("hY[0] = %.6e (expected %.6e)\n", hY[0], expected); |
| 134 | + |
| 135 | + // shutdown |
| 136 | + CUDA_CHECK(cudaGraphExecDestroy(graphExec)); |
| 137 | + CUDA_CHECK(cudaEventDestroy(evStart)); |
| 138 | + CUDA_CHECK(cudaEventDestroy(evStop)); |
| 139 | + CUDA_CHECK(cudaStreamDestroy(stream)); |
| 140 | + CUDA_CHECK(cudaFree(dX)); |
| 141 | + CUDA_CHECK(cudaFree(dY)); |
| 142 | + |
| 143 | + TracyCUDAStopProfiling(cudaCtx); |
| 144 | + TracyCUDAContextDestroy(cudaCtx); |
| 145 | + return 0; |
| 146 | +} |
0 commit comments