Skip to content

Commit 9b169ef

Browse files
authored
Merge pull request #1391 from wolfpld/slomp/cuda-examples
Adding CUDA examples
2 parents 7679779 + 64797dc commit 9b169ef

4 files changed

Lines changed: 196 additions & 0 deletions

File tree

examples/cuda/README.md

Whitespace-only changes.

examples/cuda/graph/CMakeLists.txt

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
cmake_minimum_required(VERSION 3.18)
2+
project(CUDAGraphDemo LANGUAGES CXX CUDA)
3+
4+
set(CMAKE_CXX_STANDARD 17)
5+
set(CMAKE_CUDA_STANDARD 17)
6+
7+
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
8+
set(CMAKE_CUDA_ARCHITECTURES native)
9+
endif()
10+
11+
set(TRACY_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../.."
12+
CACHE PATH "Root of the Tracy repository")
13+
set(TRACY_PUBLIC "${TRACY_PATH}/public")
14+
15+
find_package(CUDAToolkit REQUIRED)
16+
find_package(Threads REQUIRED)
17+
18+
# cuda-graph-demo.cu embeds Tracy via #include <TracyClient.cpp> (unity build),
19+
# so no separate TracyClient library is needed — just expose the public headers.
20+
add_executable(cuda-graph-demo cuda-graph-demo.cu)
21+
target_include_directories(cuda-graph-demo PRIVATE ${TRACY_PUBLIC})
22+
target_link_libraries(cuda-graph-demo PRIVATE
23+
CUDA::cupti CUDA::cuda_driver Threads::Threads ${CMAKE_DL_LIBS})
24+
25+
# ctest-related integration below
26+
# to run the binaries via ctest:
27+
# ctest --test-dir <cmake-build-dir> -R <binary-name> -C <build-config>
28+
29+
enable_testing()
30+
add_test(NAME cuda-graph-demo COMMAND cuda-graph-demo)
31+
32+
# On Windows, CUPTI's DLL must be on PATH at runtime.
33+
if(WIN32)
34+
set(_cupti_dir "$<TARGET_FILE_DIR:CUDA::cupti>")
35+
set_target_properties(cuda-graph-demo PROPERTIES
36+
VS_DEBUGGER_ENVIRONMENT "PATH=${_cupti_dir};$ENV{PATH}")
37+
set_tests_properties(cuda-graph-demo PROPERTIES
38+
ENVIRONMENT "PATH=${_cupti_dir};$ENV{PATH}")
39+
endif()

examples/cuda/graph/build.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
TRACY_PATH=<path-to-tracy>
2+
CUDA_TOOLKIT_PATH=/usr/local/cuda
3+
CUDA_CUPTI_PATH=${CUDA_TOOLKIT_PATH}/extras/CUPTI
4+
5+
# pass -v to nvcc for verbose build information
6+
nvcc -O2 -std=c++17 cuda-graph-demo.cu \
7+
-o cuda-graph-demo \
8+
-I "${TRACY_PATH}/public" \
9+
-I "${CUDA_CUPTI_PATH}/include" -I "${CUDA_TOOLKIT_PATH}/include" \
10+
-L "${CUDA_CUPTI_PATH}/lib64" -L "${CUDA_TOOLKIT_PATH}/lib64" \
11+
-lcupti -lcuda
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
#include <cuda_runtime.h>
2+
3+
// WARN: for simplicity, we enable and "embed" the Tracy client directly into the code
4+
#define TRACY_ENABLE
5+
#include <TracyClient.cpp>
6+
7+
#include <tracy/Tracy.hpp>
8+
#include <tracy/TracyCUDA.hpp>
9+
10+
#include <cstdio>
11+
#include <cstdlib>
12+
#include <vector>
13+
14+
#define CUDA_CHECK(call) \
15+
do { \
16+
cudaError_t err__ = (call); \
17+
if (err__ != cudaSuccess) { \
18+
std::fprintf(stderr, "CUDA error %s at %s:%d: %s\n", \
19+
cudaGetErrorName(err__), __FILE__, __LINE__, \
20+
cudaGetErrorString(err__)); \
21+
std::exit(EXIT_FAILURE); \
22+
} \
23+
} while (0)
24+
25+
__global__ void saxpy(float a, const float* x, float* y, int n)
26+
{
27+
int i = blockIdx.x * blockDim.x + threadIdx.x;
28+
if (i < n) y[i] = a * x[i] + y[i];
29+
}
30+
31+
int main()
32+
{
33+
// CUPTI-backed Tracy context. Auto-captures all CUDA activity from the
34+
// point StartProfiling() is called until StopProfiling(). The background
35+
// collector thread flushes activity into Tracy; the explicit Collect()
36+
// calls below just force a flush at known phase boundaries.
37+
auto* cudaCtx = TracyCUDAContext();
38+
{
39+
constexpr char ctxName[] = "CUDA Graph Demo";
40+
TracyCUDAContextName(cudaCtx, ctxName, sizeof(ctxName) - 1);
41+
}
42+
TracyCUDAStartProfiling(cudaCtx);
43+
44+
constexpr int N = 1 << 16; // small N => kernel is short => launch overhead dominates
45+
constexpr int KERNELS_PER_GRAPH = 32; // chain length captured into the graph
46+
constexpr int OUTER_ITERS = 2000; // how many times we replay the chain
47+
48+
// allocate device buffers
49+
float *dX = nullptr, *dY = nullptr;
50+
CUDA_CHECK(cudaMalloc(&dX, N * sizeof(float)));
51+
CUDA_CHECK(cudaMalloc(&dY, N * sizeof(float)));
52+
53+
std::vector<float> hX(N, 1.0f);
54+
CUDA_CHECK(cudaMemcpy(dX, hX.data(), N * sizeof(float), cudaMemcpyHostToDevice));
55+
56+
cudaStream_t stream = nullptr;
57+
CUDA_CHECK(cudaStreamCreate(&stream));
58+
59+
const dim3 block(256);
60+
const dim3 grid((N + block.x - 1) / block.x);
61+
62+
cudaEvent_t evStart, evStop;
63+
CUDA_CHECK(cudaEventCreate(&evStart));
64+
CUDA_CHECK(cudaEventCreate(&evStop));
65+
66+
// warm-up (so first-launch lazy-init and/or JIT doesn't bias the measurement)
67+
saxpy<<<grid, block, 0, stream>>>(0.0f, dX, dY, N);
68+
CUDA_CHECK(cudaStreamSynchronize(stream));
69+
70+
// baseline: launch each kernel directly on the stream
71+
float msStream = 0.0f;
72+
{
73+
ZoneScopedN("stream-launches");
74+
CUDA_CHECK(cudaMemsetAsync(dY, 0, N * sizeof(float), stream));
75+
CUDA_CHECK(cudaEventRecord(evStart, stream));
76+
for (int outer = 0; outer < OUTER_ITERS; ++outer) {
77+
for (int k = 0; k < KERNELS_PER_GRAPH; ++k) {
78+
saxpy<<<grid, block, 0, stream>>>(1.0e-6f, dX, dY, N);
79+
}
80+
}
81+
CUDA_CHECK(cudaEventRecord(evStop, stream));
82+
CUDA_CHECK(cudaEventSynchronize(evStop));
83+
CUDA_CHECK(cudaEventElapsedTime(&msStream, evStart, evStop));
84+
TracyCUDACollect(cudaCtx);
85+
}
86+
87+
// capture: record the same kernel chain into a graph
88+
cudaGraph_t graph = nullptr;
89+
cudaGraphExec_t graphExec = nullptr;
90+
{
91+
ZoneScopedN("graph-capture");
92+
// cudaStreamCaptureModeRelaxed allows the calling thread to perform
93+
// unrelated CUDA work during capture; ThreadLocal is stricter if you need
94+
// isolation. Most short, single-stream captures work fine in either mode.
95+
CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
96+
for (int k = 0; k < KERNELS_PER_GRAPH; ++k) {
97+
saxpy<<<grid, block, 0, stream>>>(1.0e-6f, dX, dY, N);
98+
}
99+
CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
100+
101+
// Instantiate once -> reusable executable graph.
102+
CUDA_CHECK(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
103+
104+
// The template graph isn't needed once instantiated.
105+
CUDA_CHECK(cudaGraphDestroy(graph));
106+
}
107+
108+
// replay: launch the instantiated graph OUTER_ITERS times
109+
float msGraph = 0.0f;
110+
{
111+
ZoneScopedN("graph-launches");
112+
CUDA_CHECK(cudaMemsetAsync(dY, 0, N * sizeof(float), stream));
113+
CUDA_CHECK(cudaEventRecord(evStart, stream));
114+
for (int outer = 0; outer < OUTER_ITERS; ++outer) {
115+
CUDA_CHECK(cudaGraphLaunch(graphExec, stream));
116+
}
117+
CUDA_CHECK(cudaEventRecord(evStop, stream));
118+
CUDA_CHECK(cudaEventSynchronize(evStop));
119+
CUDA_CHECK(cudaEventElapsedTime(&msGraph, evStart, evStop));
120+
TracyCUDACollect(cudaCtx);
121+
}
122+
123+
// sanity check: y[i] = OUTER_ITERS * KERNELS_PER_GRAPH * 1e-6 * x[i]
124+
std::vector<float> hY(N);
125+
CUDA_CHECK(cudaMemcpy(hY.data(), dY, N * sizeof(float), cudaMemcpyDeviceToHost));
126+
const float expected = float(OUTER_ITERS) * float(KERNELS_PER_GRAPH) * 1.0e-6f;
127+
128+
std::printf("Stream launches: %8.3f ms (%d kernels)\n",
129+
msStream, OUTER_ITERS * KERNELS_PER_GRAPH);
130+
std::printf("Graph launches: %8.3f ms (%d graph launches x %d kernels)\n",
131+
msGraph, OUTER_ITERS, KERNELS_PER_GRAPH);
132+
std::printf("Speedup : %8.2fx\n", msStream / msGraph);
133+
std::printf("hY[0] = %.6e (expected %.6e)\n", hY[0], expected);
134+
135+
// shutdown
136+
CUDA_CHECK(cudaGraphExecDestroy(graphExec));
137+
CUDA_CHECK(cudaEventDestroy(evStart));
138+
CUDA_CHECK(cudaEventDestroy(evStop));
139+
CUDA_CHECK(cudaStreamDestroy(stream));
140+
CUDA_CHECK(cudaFree(dX));
141+
CUDA_CHECK(cudaFree(dY));
142+
143+
TracyCUDAStopProfiling(cudaCtx);
144+
TracyCUDAContextDestroy(cudaCtx);
145+
return 0;
146+
}

0 commit comments

Comments
 (0)