Skip to content

Commit 8c04e2c

Browse files
cole-brower and brycelelbach
authored and committed
add: notebook sources and support files
1 parent 41c6a0c commit 8c04e2c

File tree

73 files changed

+12983
-5
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+12983
-5
lines changed

tutorials/floating-point-emulation/brev/dockerfile

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,31 @@
11
FROM ubuntu:24.04 AS artifacts
22

33
COPY . /accelerated-computing-hub
4-
RUN find /accelerated-computing-hub/tutorials \
5-
-mindepth 1 -maxdepth 1 -type d -not -name "floating-point-emulation" \
6-
-exec rm -rf {} +
7-
RUN rm -rf /accelerated-computing-hub/.git
4+
# RUN find /accelerated-computing-hub/tutorials \
5+
# -mindepth 1 -maxdepth 1 -type d -not -name "floating-point-emulation" \
6+
# -exec rm -rf {} +
7+
# RUN rm -rf /accelerated-computing-hub/.git
88

99
FROM nvidia/cuda:13.1.0-base-ubuntu24.04
1010

1111
# Install CUDA Toolkit + build tools
1212
RUN apt update -y \
13-
&& apt install -y wget \
13+
&& apt install -y wget curl gnupg lsb-release \
1414
&& wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
1515
&& echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
16+
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \
17+
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
1618
&& apt update -y \
1719
&& apt install -y cuda-nvrtc-13-1 cuda-cccl-13-1 libcublas-dev-13-1 \
1820
libnvjitlink-13-1 cuda-cudart-13-1 cuda-nvcc-13-1 libnvvm-13-1 \
1921
python-is-python3 python3-venv \
2022
build-essential cmake \
23+
git git-lfs \
24+
docker-ce \
25+
docker-ce-cli \
26+
containerd.io \
27+
docker-buildx-plugin \
28+
docker-compose-plugin \
2129
&& apt-get clean -y
2230

2331
# Install MathDx
@@ -60,4 +68,8 @@ COPY --from=artifacts /accelerated-computing-hub /accelerated-computing-hub
6068

6169
WORKDIR /accelerated-computing-hub/tutorials/${ACH_TUTORIAL}/notebooks
6270

71+
# Setup Git.
72+
RUN git config --unset-all "http.https://github.com/.extraheader" || { code=$?; [ "$code" = 5 ] || exit "$code"; } \
73+
&& git config --global --add safe.directory "/accelerated-computing-hub"
74+
6375
ENTRYPOINT ["/accelerated-computing-hub/brev/jupyter-start.bash"]
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Global CXX flags/options.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
enable_testing()

# Help find_program locate nvcc and friends from the CUDA 13.1 toolkit.
list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")

# Set default arguments.
set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
endif()

# Find cuBLASDx (shipped as part of the MathDx package).
message(CHECK_START "Example Wrapper: Looking for MathDx package")
find_package(mathdx REQUIRED CONFIG
  PATHS
    "/opt/nvidia/mathdx/25.12"
)
# Close the CHECK_START above; find_package(REQUIRED) aborts on failure.
message(CHECK_PASS "found")

find_package(CUDAToolkit REQUIRED)

# Guard against an explicitly empty -DTUTORIAL_CUDA_ARCHITECTURE= on the
# command line (the cache default above means it is otherwise always defined).
if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
  message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
endif()

if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
  message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
endif()

# Split into the numeric part and optional modifier letter. [a-z] keeps the
# capture consistent with the validation regex above (which rejects uppercase).
string(REGEX MATCH "^([0-9]+)([a-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")

# cuBLASDx SM<> values carry a trailing zero, e.g. 89 -> 890.
set(TUTORIAL_SM "${CMAKE_MATCH_1}0")
set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # empty if no letter

if(TUTORIAL_SM_LETTER STREQUAL "")
  # No letter: generic code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
elseif(TUTORIAL_SM_LETTER STREQUAL "a")
  # Letter 'a': architecture-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
elseif(TUTORIAL_SM_LETTER STREQUAL "f")
  # Letter 'f': family-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
else()
  # Fixed typo: was "mesage", which would fail with "Unknown CMake command"
  # instead of reporting the unsupported modifier.
  message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
endif()

set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")

# Header-only helper library shared by all tutorials.
add_library(helpers INTERFACE)
target_include_directories(helpers INTERFACE include/)

# add_tutorial(<name> <source>)
#
# Creates an executable and a matching CTest entry for one tutorial, links in
# cuBLAS/cuBLASDx and the helper headers, and passes the selected SM value and
# modifier to the source as SM_VALUE / SM_MODIFIER_VALUE.
function(add_tutorial tutorial_name tutorial_file)
  add_executable("${tutorial_name}" "${tutorial_file}")
  add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")
  target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM})
  target_compile_definitions("${tutorial_name}" PUBLIC SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER})
  target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas)
  target_link_libraries("${tutorial_name}" PRIVATE mathdx::cublasdx)
  target_link_libraries("${tutorial_name}" PRIVATE helpers)
  target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr")
endfunction()
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Global CXX flags/options.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
enable_testing()

# Set default arguments.
set(TUTORIAL_CUDA_ARCHITECTURE "89" CACHE STRING "CUDA SM value with modifier, e.g. 89 or 100a")
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
endif()

# Find cuBLASDx (shipped as part of the MathDx package).
message(CHECK_START "Example Wrapper: Looking for MathDx package")
find_package(mathdx REQUIRED CONFIG
  PATHS
    "/opt/nvidia/mathdx/25.12"
)
# Close the CHECK_START above; find_package(REQUIRED) aborts on failure.
message(CHECK_PASS "found")

find_package(CUDAToolkit REQUIRED)

# Guard against an explicitly empty -DTUTORIAL_CUDA_ARCHITECTURE= on the
# command line (the cache default above means it is otherwise always defined).
if(NOT DEFINED TUTORIAL_CUDA_ARCHITECTURE OR TUTORIAL_CUDA_ARCHITECTURE STREQUAL "")
  message(FATAL_ERROR "You must set TUTORIAL_CUDA_ARCHITECTURE, e.g. -DTUTORIAL_CUDA_ARCHITECTURE=89 or -DTUTORIAL_CUDA_ARCHITECTURE=90a")
endif()

if(NOT TUTORIAL_CUDA_ARCHITECTURE MATCHES "^[0-9]+[a-z]?$")
  message(FATAL_ERROR "TUTORIAL_CUDA_ARCHITECTURE must be of form sm[modifier], e.g. 89 or 100a")
endif()

# Split into the numeric part and optional modifier letter. [a-z] keeps the
# capture consistent with the validation regex above (which rejects uppercase).
string(REGEX MATCH "^([0-9]+)([a-z])?$" _match "${TUTORIAL_CUDA_ARCHITECTURE}")

# cuBLASDx SM<> values carry a trailing zero, e.g. 89 -> 890.
set(TUTORIAL_SM "${CMAKE_MATCH_1}0")
set(TUTORIAL_SM_LETTER "${CMAKE_MATCH_2}") # empty if no letter

if(TUTORIAL_SM_LETTER STREQUAL "")
  # No letter: generic code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::generic")
elseif(TUTORIAL_SM_LETTER STREQUAL "a")
  # Letter 'a': architecture-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::arch_specific")
elseif(TUTORIAL_SM_LETTER STREQUAL "f")
  # Letter 'f': family-specific code generation.
  set(TUTORIAL_SM_MODIFIER "cublasdx::family_specific")
else()
  # Fixed typo: was "mesage", which would fail with "Unknown CMake command"
  # instead of reporting the unsupported modifier.
  message(FATAL_ERROR "Unsupported SM modifier letter '${TUTORIAL_SM_LETTER}'. Allowed: empty, 'a', or 'f'.")
endif()

set(CMAKE_CUDA_ARCHITECTURES "${TUTORIAL_CUDA_ARCHITECTURE}")

# The including project must define the header-only helper target first.
if(NOT TARGET tutorial_helpers)
  message(FATAL_ERROR "Please add tutorial_helpers library before including tutorial.cmake")
endif()

# add_tutorial(<name> <source>)
#
# Creates an executable and a matching CTest entry for one tutorial, links in
# cuBLAS/cuBLASDx and the tutorial_helpers headers, and passes the selected SM
# value and modifier to the source as SM_VALUE / SM_MODIFIER_VALUE.
function(add_tutorial tutorial_name tutorial_file)
  add_executable("${tutorial_name}" "${tutorial_file}")
  add_test(NAME "${tutorial_name}" COMMAND "${tutorial_name}")
  target_compile_definitions("${tutorial_name}" PUBLIC SM_VALUE=${TUTORIAL_SM})
  target_compile_definitions("${tutorial_name}" PUBLIC SM_MODIFIER_VALUE=${TUTORIAL_SM_MODIFIER})
  target_link_libraries("${tutorial_name}" PRIVATE CUDA::cublas)
  target_link_libraries("${tutorial_name}" PRIVATE mathdx::cublasdx)
  target_link_libraries("${tutorial_name}" PRIVATE tutorial_helpers)
  target_compile_options("${tutorial_name}" PRIVATE "--expt-relaxed-constexpr")
endfunction()
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
cmake_minimum_required(VERSION 4.0)

# Help find_program locate nvcc from the CUDA 13.1 toolkit before project()
# enables the CUDA language.
list(APPEND CMAKE_PROGRAM_PATH "/usr/local/cuda-13.1/bin")
project(cublasdx-dgemm-tutorial VERSION 0.1 LANGUAGES CUDA CXX)

# Header-only tutorial helper files.
add_library(tutorial_helpers INTERFACE)
target_include_directories(tutorial_helpers INTERFACE include/)

# Shared tutorial configuration and the add_tutorial() helper. The path is
# anchored explicitly so it stays correct if this list file is ever included
# from elsewhere.
include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/common.cmake")

add_tutorial(1a_simple_dgemm_tensor src/1a_simple_dgemm_tensor.cu)
add_tutorial(1b_simple_dgemm_shared src/1b_simple_dgemm_shared.cu)
add_tutorial(1c_simple_dgemm_cublasdx src/1c_simple_dgemm_cublasdx.cu)
add_tutorial(1d_simple_pipelined_dgemm src/1d_simple_pipelined_dgemm.cu)
add_tutorial(2a_unfused_emulation src/2a_unfused_emulation/dgemm_emulation.cu)
add_tutorial(2b_partially_fused_emulation src/2b_partially_fused_emulation/dgemm_emulation.cu)
add_tutorial(2c_fully_fused_emulation src/2c_fully_fused_emulation/dgemm_emulation.cu)
add_tutorial(3a_fused_syrk_emulation src/3a_fused_syrk_emulation/syrk_emulation.cu)
20+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#pragma once

// Included here so the header is self-contained for any translation unit that
// invokes the macro (std::cout, std::exit). CUDA runtime symbols
// (cudaError_t, cudaSuccess, cudaGetErrorString) are assumed to be provided
// by the CUDA compilation environment — NOTE(review): confirm all includers
// are compiled with nvcc or include the CUDA runtime header first.
#include <cstdlib>
#include <iostream>

// CUDA_CHECK_AND_EXIT(error): evaluate a CUDA runtime call (or error code);
// on failure, print the error string with file/line and terminate the process
// with that code. Wrapped in do { } while (0) so the macro expands to a
// single statement and is safe inside un-braced if/else.
#ifndef CUDA_CHECK_AND_EXIT
#    define CUDA_CHECK_AND_EXIT(error)                                                                 \
        do {                                                                                           \
            auto status = static_cast<cudaError_t>(error);                                             \
            if (status != cudaSuccess) {                                                               \
                std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__          \
                          << std::endl;                                                                \
                std::exit(status); /* exit codes above 255 are truncated by the OS */                  \
            }                                                                                          \
        } while (0)
#endif
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#pragma once
2+
3+
namespace tutorial {
4+
5+
enum class matrix_half
6+
{
7+
lower,
8+
upper
9+
};
10+
11+
namespace detail {
12+
template<class T>
13+
struct is_complex_helper {
14+
static constexpr bool value = false;
15+
};
16+
17+
template<class T>
18+
struct is_complex_helper<cublasdx::complex<T>> {
19+
static constexpr bool value = true;
20+
};
21+
22+
template<class T>
23+
struct is_complex_helper<std::complex<T>> {
24+
static constexpr bool value = true;
25+
};
26+
27+
template<class T>
28+
struct is_complex_helper<cuda::std::complex<T>> {
29+
static constexpr bool value = true;
30+
};
31+
} // namespace detail
32+
33+
template<typename T>
34+
CUBLASDX_HOST_DEVICE constexpr bool is_complex() {
35+
return detail::is_complex_helper<T>::value;
36+
}
37+
38+
namespace detail {
39+
template<typename T>
40+
double cbabs(T v) {
41+
if constexpr (is_complex<T>()) {
42+
auto imag = std::abs(static_cast<double>(v.imag()));
43+
auto real = std::abs(static_cast<double>(v.real()));
44+
return (real + imag) / 2.0;
45+
} else {
46+
return std::abs(static_cast<double>(v));
47+
}
48+
}
49+
} // namespace detail
50+
51+
template<typename T1, typename T2>
52+
__host__ __device__ __forceinline__ constexpr T1 convert(T2 v) {
53+
constexpr bool is_output_complex = cublasdx::detail::has_complex_interface_v<T1>;
54+
constexpr bool is_input_complex = cublasdx::detail::has_complex_interface_v<T2>;
55+
if constexpr (is_input_complex and is_output_complex) {
56+
using t1_vt = typename T1::value_type;
57+
return T1(convert<t1_vt>(v.real()), convert<t1_vt>(v.imag()));
58+
} else if constexpr (is_output_complex) {
59+
using t1_vt = typename T1::value_type;
60+
return T1(convert<t1_vt>(v), convert<t1_vt>(v));
61+
} else if constexpr (is_input_complex) {
62+
return convert<T1>(v.real());
63+
} else if constexpr (COMMONDX_STL_NAMESPACE::is_convertible_v<T2, T1>) {
64+
return static_cast<T1>(v);
65+
} else if constexpr (COMMONDX_STL_NAMESPACE::is_constructible_v<T1, T2>) {
66+
return T1(v);
67+
} else {
68+
static_assert(COMMONDX_STL_NAMESPACE::is_convertible_v<T2, T1>,
69+
"Please provide your own conversion function");
70+
}
71+
}
72+
73+
template<typename T>
74+
struct converter {
75+
template<class V>
76+
CUBLASDX_HOST_DEVICE constexpr T operator()(V const& v) const {
77+
return convert<T>(v);
78+
}
79+
};
80+
81+
} // namespace tutorial
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#pragma once
2+
3+
#include "cuda_utilities.hpp"
4+
5+
namespace tutorial {
6+
7+
// Flop count of an m x n x k GEMM (2*m*n*k), pre-scaled by 1e9 so that
// dividing by a runtime in *milliseconds* yields TFLOP/s
// (flops / 1e12 / seconds == flops / 1e9 / milliseconds).
double real_gemm_tflops(unsigned m, unsigned n, unsigned k) {
    return (2. * m * n * k) / 1e9;
}

// Same pre-scaled flop count for an n x n SYRK with inner dimension k, which
// only computes one triangular half of the output: n*(n+1)/2 of the n*n
// GEMM flops.
double real_syrk_tflops(unsigned n, unsigned k) {
    // Compute the ratio in double precision: the original unsigned products
    // n*(n+1) and n*n overflow 32-bit arithmetic for n >= 65536.
    const double nd = static_cast<double>(n);
    const double syrk_to_gemm_flop_ratio = (nd * (nd + 1.0) / 2.0) / (nd * nd);
    return real_gemm_tflops(n, n, k) * syrk_to_gemm_flop_ratio;
}
15+
16+
struct measure {
17+
// Returns execution time in ms.
18+
template<typename Kernel>
19+
static float execution(Kernel&& kernel,
20+
const unsigned int warm_up_runs,
21+
const unsigned int runs,
22+
cudaStream_t stream) {
23+
cudaEvent_t startEvent, stopEvent;
24+
CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent));
25+
CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent));
26+
CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
27+
28+
for (unsigned int i = 0; i < warm_up_runs; i++) {
29+
kernel(stream);
30+
}
31+
32+
CUDA_CHECK_AND_EXIT(cudaGetLastError());
33+
CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
34+
35+
CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream));
36+
for (unsigned int i = 0; i < runs; i++) {
37+
kernel(stream);
38+
}
39+
CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream));
40+
CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());
41+
42+
float time;
43+
CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent));
44+
CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent));
45+
CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent));
46+
return time / runs;
47+
}
48+
};
49+
50+
} // namespace tutorial

0 commit comments

Comments
 (0)