Skip to content

Commit 8c04e2c

Browse files
cole-brower and brycelelbach
authored and committed
add: notebook sources and support files
1 parent 41c6a0c commit 8c04e2c

File tree

73 files changed

+12983
-5
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+12983
-5
lines changed

tutorials/floating-point-emulation/brev/dockerfile

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,31 @@
11
FROM ubuntu:24.04 AS artifacts
22

33
COPY . /accelerated-computing-hub
4-
RUN find /accelerated-computing-hub/tutorials \
5-
-mindepth 1 -maxdepth 1 -type d -not -name "floating-point-emulation" \
6-
-exec rm -rf {} +
7-
RUN rm -rf /accelerated-computing-hub/.git
4+
# RUN find /accelerated-computing-hub/tutorials \
5+
# -mindepth 1 -maxdepth 1 -type d -not -name "floating-point-emulation" \
6+
# -exec rm -rf {} +
7+
# RUN rm -rf /accelerated-computing-hub/.git
88

99
FROM nvidia/cuda:13.1.0-base-ubuntu24.04
1010

1111
# Install CUDA Toolkit + build tools
1212
RUN apt update -y \
13-
&& apt install -y wget \
13+
&& apt install -y wget curl gnupg lsb-release \
1414
&& wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
1515
&& echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
16+
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \
17+
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
1618
&& apt update -y \
1719
&& apt install -y cuda-nvrtc-13-1 cuda-cccl-13-1 libcublas-dev-13-1 \
1820
libnvjitlink-13-1 cuda-cudart-13-1 cuda-nvcc-13-1 libnvvm-13-1 \
1921
python-is-python3 python3-venv \
2022
build-essential cmake \
23+
git git-lfs \
24+
docker-ce \
25+
docker-ce-cli \
26+
containerd.io \
27+
docker-buildx-plugin \
28+
docker-compose-plugin \
2129
&& apt-get clean -y
2230

2331
# Install MathDx
@@ -60,4 +68,8 @@ COPY --from=artifacts /accelerated-computing-hub /accelerated-computing-hub
6068

6169
WORKDIR /accelerated-computing-hub/tutorials/${ACH_TUTORIAL}/notebooks
6270

71+
# Setup Git.
72+
RUN git config --unset-all "http.https://github.com/.extraheader" || { code=$?; [ "$code" = 5 ] || exit "$code"; } \
73+
&& git config --global --add safe.directory "/accelerated-computing-hub"
74+
6375
ENTRYPOINT ["/accelerated-computing-hub/brev/jupyter-start.bash"]
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Global CXX flags/options.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
enable_testing()

# Help find_program locate nvcc and friends from the CUDA 13.1 toolkit.
list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")

# Set default arguments.
set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
endif()

# Find cuBLASDx (shipped as part of the MathDx package).
message(CHECK_START "Example Wrapper: Looking for MathDx package")
find_package(mathdx REQUIRED CONFIG
  PATHS
    "/opt/nvidia/mathdx/25.12"
)
# Close the CHECK_START above; find_package(REQUIRED) aborts on failure.
message(CHECK_PASS "found")

find_package(CUDAToolkit REQUIRED)

# Guard against an explicitly empty -DTUTORIAL_CUDA_ARCHITECTURE= on the
# command line (the cache default above means it is otherwise always defined).
if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
  message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
endif()

if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
  message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
endif()

# Split into the numeric part and optional modifier letter. [a-z] keeps the
# capture consistent with the validation regex above (which rejects uppercase).
string(REGEX MATCH "^([0-9]+)([a-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")

# cuBLASDx SM<> values carry a trailing zero, e.g. 89 -> 890.
set(TUTORIAL_SM "${CMAKE_MATCH_1}0")
set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # empty if no letter

if(TUTORIAL_SM_LETTER STREQUAL "")
  # No letter: generic code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
elseif(TUTORIAL_SM_LETTER STREQUAL "a")
  # Letter 'a': architecture-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
elseif(TUTORIAL_SM_LETTER STREQUAL "f")
  # Letter 'f': family-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
else()
  # Fixed typo: was "mesage", which would fail with "Unknown CMake command"
  # instead of reporting the unsupported modifier.
  message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
endif()

set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")

# Header-only helper library shared by all tutorials.
add_library(helpers INTERFACE)
target_include_directories(helpers INTERFACE include/)

# add_tutorial(<name> <source>)
#
# Creates an executable and a matching CTest entry for one tutorial, links in
# cuBLAS/cuBLASDx and the helper headers, and passes the selected SM value and
# modifier to the source as SM_VALUE / SM_MODIFIER_VALUE.
function(add_tutorial tutorial_name tutorial_file)
  add_executable("${tutorial_name}" "${tutorial_file}")
  add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")
  target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM})
  target_compile_definitions("${tutorial_name}" PUBLIC SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER})
  target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas)
  target_link_libraries("${tutorial_name}" PRIVATE mathdx::cublasdx)
  target_link_libraries("${tutorial_name}" PRIVATE helpers)
  target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr")
endfunction()
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Global CXX flags/options.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
enable_testing()

# Set default arguments.
set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
endif()

# Find cuBLASDx (shipped as part of the MathDx package).
message(CHECK_START "Example Wrapper: Looking for MathDx package")
find_package(mathdx REQUIRED CONFIG
  PATHS
    "/opt/nvidia/mathdx/25.12"
)
# Close the CHECK_START above; find_package(REQUIRED) aborts on failure.
message(CHECK_PASS "found")

find_package(CUDAToolkit REQUIRED)

# Guard against an explicitly empty -DTUTORIAL_CUDA_ARCHITECTURE= on the
# command line (the cache default above means it is otherwise always defined).
if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
  message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
endif()

if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
  message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
endif()

# Split into the numeric part and optional modifier letter. [a-z] keeps the
# capture consistent with the validation regex above (which rejects uppercase).
string(REGEX MATCH "^([0-9]+)([a-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")

# cuBLASDx SM<> values carry a trailing zero, e.g. 89 -> 890.
set(TUTORIAL_SM "${CMAKE_MATCH_1}0")
set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # empty if no letter

if(TUTORIAL_SM_LETTER STREQUAL "")
  # No letter: generic code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
elseif(TUTORIAL_SM_LETTER STREQUAL "a")
  # Letter 'a': architecture-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
elseif(TUTORIAL_SM_LETTER STREQUAL "f")
  # Letter 'f': family-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
else()
  # Fixed typo: was "mesage", which would fail with "Unknown CMake command"
  # instead of reporting the unsupported modifier.
  message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
endif()

set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")

# The including project must define the header-only helper target first.
if(NOT TARGET tutorial_helpers)
  message(FATAL_ERROR "Please add tutorial_helpers library before including tutorial.cmake")
endif()

# add_tutorial(<name> <source>)
#
# Creates an executable and a matching CTest entry for one tutorial, links in
# cuBLAS/cuBLASDx and the tutorial_helpers headers, and passes the selected SM
# value and modifier to the source as SM_VALUE / SM_MODIFIER_VALUE.
function(add_tutorial tutorial_name tutorial_file)
  add_executable("${tutorial_name}" "${tutorial_file}")
  add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")
  target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM})
  target_compile_definitions("${tutorial_name}" PUBLIC SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER})
  target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas)
  target_link_libraries("${tutorial_name}" PRIVATE mathdx::cublasdx)
  target_link_libraries("${tutorial_name}" PRIVATE tutorial_helpers)
  target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr")
endfunction()
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
cmake_minimum_required(VERSION 4.0)

# Help find_program locate nvcc from the CUDA 13.1 toolkit before project()
# enables the CUDA language.
list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")
project(cublasdx-dgemm-tutorial VERSION 0.1 LANGUAGES CUDA CXX)

# Header-only tutorial helper files.
add_library(tutorial_helpers INTERFACE)
target_include_directories(tutorial_helpers INTERFACE include/)

# Shared tutorial configuration and the add_tutorial() helper. The path is
# anchored explicitly so it stays correct if this list file is ever included
# from elsewhere.
include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/common.cmake")

add_tutorial(1a_simple_dgemm_tensor src/1a_simple_dgemm_tensor.cu)
add_tutorial(1b_simple_dgemm_shared src/1b_simple_dgemm_shared.cu)
add_tutorial(1c_simple_dgemm_cublasdx src/1c_simple_dgemm_cublasdx.cu)
add_tutorial(1d_simple_pipelined_dgemm src/1d_simple_pipelined_dgemm.cu)
add_tutorial(2a_unfused_emulation src/2a_unfused_emulation/dgemm_emulation.cu)
add_tutorial(2b_partially_fused_emulation src/2b_partially_fused_emulation/dgemm_emulation.cu)
add_tutorial(2c_fully_fused_emulation src/2c_fully_fused_emulation/dgemm_emulation.cu)
add_tutorial(3a_fused_syrk_emulation src/3a_fused_syrk_emulation/syrk_emulation.cu)
20+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#pragma once

// Included here so the header is self-contained for any translation unit that
// invokes the macro (std::cout, std::exit). CUDA runtime symbols
// (cudaError_t, cudaSuccess, cudaGetErrorString) are assumed to be provided
// by the CUDA compilation environment — NOTE(review): confirm all includers
// are compiled with nvcc or include the CUDA runtime header first.
#include <cstdlib>
#include <iostream>

// CUDA_CHECK_AND_EXIT(error): evaluate a CUDA runtime call (or error code);
// on failure, print the error string with file/line and terminate the process
// with that code. Wrapped in do { } while (0) so the macro expands to a
// single statement and is safe inside un-braced if/else.
#ifndef CUDA_CHECK_AND_EXIT
#    define CUDA_CHECK_AND_EXIT(error)                                                                 \
        do {                                                                                           \
            auto status = static_cast<cudaError_t>(error);                                             \
            if (status != cudaSuccess) {                                                               \
                std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__          \
                          << std::endl;                                                                \
                std::exit(status); /* exit codes above 255 are truncated by the OS */                  \
            }                                                                                          \
        } while (0)
#endif
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#pragma once
2+
3+
namespace tutorial {
4+
5+
enum class matrix_half
6+
{
7+
lower,
8+
upper
9+
};
10+
11+
namespace detail {
12+
template<class T>
13+
struct is_complex_helper {
14+
static constexpr bool value = false;
15+
};
16+
17+
template<class T>
18+
struct is_complex_helper<cublasdx::complex<T>> {
19+
static constexpr bool value = true;
20+
};
21+
22+
template<class T>
23+
struct is_complex_helper<std::complex<T>> {
24+
static constexpr bool value = true;
25+
};
26+
27+
template<class T>
28+
struct is_complex_helper<cuda::std::complex<T>> {
29+
static constexpr bool value = true;
30+
};
31+
} // namespace detail
32+
33+
template<typename T>
34+
CUBLASDX_HOST_DEVICE constexpr bool is_complex() {
35+
return detail::is_complex_helper<T>::value;
36+
}
37+
38+
namespace detail {
39+
template<typename T>
40+
double cbabs(T v) {
41+
if constexpr (is_complex<T>()) {
42+
auto imag = std::abs(static_cast<double>(v.imag()));
43+
auto real = std::abs(static_cast<double>(v.real()));
44+
return (real + imag) / 2.0;
45+
} else {
46+
return std::abs(static_cast<double>(v));
47+
}
48+
}
49+
} // namespace detail
50+
51+
template<typename T1, typename T2>
52+
__host__ __device__ __forceinline__ constexpr T1 convert(T2 v) {
53+
constexpr bool is_output_complex = cublasdx::detail::has_complex_interface_v<T1>;
54+
constexpr bool is_input_complex = cublasdx::detail::has_complex_interface_v<T2>;
55+
if constexpr (is_input_complex and is_output_complex) {
56+
using t1_vt = typename T1::value_type;
57+
return T1(convert<t1_vt>(v.real()), convert<t1_vt>(v.imag()));
58+
} else if constexpr (is_output_complex) {
59+
using t1_vt = typename T1::value_type;
60+
return T1(convert<t1_vt>(v), convert<t1_vt>(v));
61+
} else if constexpr (is_input_complex) {
62+
return convert<T1>(v.real());
63+
} else if constexpr (COMMONDX_STL_NAMESPACE::is_convertible_v<T2, T1>) {
64+
return static_cast<T1>(v);
65+
} else if constexpr (COMMONDX_STL_NAMESPACE::is_constructible_v<T1, T2>) {
66+
return T1(v);
67+
} else {
68+
static_assert(COMMONDX_STL_NAMESPACE::is_convertible_v<T2, T1>,
69+
"Please provide your own conversion function");
70+
}
71+
}
72+
73+
template<typename T>
74+
struct converter {
75+
template<class V>
76+
CUBLASDX_HOST_DEVICE constexpr T operator()(V const& v) const {
77+
return convert<T>(v);
78+
}
79+
};
80+
81+
} // namespace tutorial
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#pragma once
2+
3+
#include "cuda_utilities.hpp"
4+
5+
namespace tutorial {
6+
7+
// Flop count of an m x n x k GEMM (2*m*n*k), pre-scaled by 1e9 so that
// dividing by a runtime in *milliseconds* yields TFLOP/s
// (flops / 1e12 / seconds == flops / 1e9 / milliseconds).
double real_gemm_tflops(unsigned m, unsigned n, unsigned k) {
    return (2. * m * n * k) / 1e9;
}

// Same pre-scaled flop count for an n x n SYRK with inner dimension k, which
// only computes one triangular half of the output: n*(n+1)/2 of the n*n
// GEMM flops.
double real_syrk_tflops(unsigned n, unsigned k) {
    // Compute the ratio in double precision: the original unsigned products
    // n*(n+1) and n*n overflow 32-bit arithmetic for n >= 65536.
    const double nd = static_cast<double>(n);
    const double syrk_to_gemm_flop_ratio = (nd * (nd + 1.0) / 2.0) / (nd * nd);
    return real_gemm_tflops(n, n, k) * syrk_to_gemm_flop_ratio;
}
15+
16+
struct measure {
17+
// Returns execution time in ms.
18+
template<typename Kernel>
19+
static float execution(Kernel&& kernel,
20+
const unsigned int warm_up_runs,
21+
const unsigned int runs,
22+
cudaStream_t stream) {
23+
cudaEvent_t startEvent, stopEvent;
24+
CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent));
25+
CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent));
26+
CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
27+
28+
for (unsigned int i = 0; i < warm_up_runs; i++) {
29+
kernel(stream);
30+
}
31+
32+
CUDA_CHECK_AND_EXIT(cudaGetLastError());
33+
CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
34+
35+
CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream));
36+
for (unsigned int i = 0; i < runs; i++) {
37+
kernel(stream);
38+
}
39+
CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream));
40+
CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
41+
42+
float time;
43+
CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent));
44+
CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent));
45+
CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent));
46+
return time / runs;
47+
}
48+
};
49+
50+
} // namespace tutorial

0 commit comments

Comments
 (0)