diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h
index 50e099dc..b94e4e59 100644
--- a/src/cuda/CUDAStream.h
+++ b/src/cuda/CUDAStream.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 #include "Stream.h"
diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu
index 14ddd752..a7f7b161 100644
--- a/src/thrust/ThrustStream.cu
+++ b/src/thrust/ThrustStream.cu
@@ -1,15 +1,37 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2025 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
 
+// Thrust fails to compile if the following macro is defined:
+#undef THRUST
+
 #include "ThrustStream.h"
 #include <thrust/inner_product.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/version.h>
 #include <thrust/zip_function.h>
 
+#if defined(MANAGED)
+#include <thrust/universal_vector.h>
+#else
+#include <thrust/device_vector.h>
+#endif
+
+template <typename T>
+using vector =
+#if defined(MANAGED)
+    thrust::universal_vector<T>;
+#else
+    thrust::device_vector<T>;
+#endif
+
+template <typename T>
+struct ThrustStream<T>::Impl{
+  vector<T> a, b, c;
+};
+
 static inline void synchronise()
 {
   // rocThrust doesn't synchronise between thrust calls
@@ -19,9 +41,8 @@ static inline void synchronise()
 }
 
 template <class T>
-ThrustStream<T>::ThrustStream(BenchId bs, const intptr_t array_size, const int device,
-    T initA, T initB, T initC)
-  : array_size{array_size}, a(array_size), b(array_size), c(array_size) {
+ThrustStream<T>::ThrustStream(const intptr_t array_size, int device)
+  : array_size{array_size}, impl(new Impl{vector<T>(array_size), vector<T>(array_size), vector<T>(array_size)}) {
   std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl;
   std::cout << "Driver: " << getDeviceDriver(device) << std::endl;
   std::cout << "Thrust version: " << THRUST_VERSION << std::endl;
@@ -50,39 +71,30 @@ ThrustStream<T>::ThrustStream(BenchId bs, const intptr_t array_size, const int d
   init_arrays(initA, initB, initC);
 }
 
+template <class T>
+ThrustStream<T>::~ThrustStream() = default;
+
 template <class T>
 void ThrustStream<T>::init_arrays(T initA, T initB, T initC)
 {
-  thrust::fill(a.begin(), a.end(), initA);
-  thrust::fill(b.begin(), b.end(), initB);
-  thrust::fill(c.begin(), c.end(), initC);
+  thrust::fill(impl->a.begin(), impl->a.end(), initA);
+  thrust::fill(impl->b.begin(), impl->b.end(), initB);
+  thrust::fill(impl->c.begin(), impl->c.end(), initC);
   synchronise();
 }
 
 template <class T>
-void ThrustStream<T>::get_arrays(T const*& a_, T const*& b_, T const*& c_)
-{
-  #if defined(MANAGED)
-  a_ = &*a.data();
-  b_ = &*b.data();
-  c_ = &*c.data();
-  #else
-  h_a.resize(array_size);
-  h_b.resize(array_size);
-  h_c.resize(array_size);
-  thrust::copy(a.begin(), a.end(), h_a.begin());
-  thrust::copy(b.begin(), b.end(), h_b.begin());
-  thrust::copy(c.begin(), c.end(), h_c.begin());
-  a_ = h_a.data();
-  b_ = h_b.data();
-  c_ = h_c.data();
-  #endif
+void ThrustStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
+{
+  thrust::copy(impl->a.begin(), impl->a.end(), h_a.begin());
+  thrust::copy(impl->b.begin(), impl->b.end(), h_b.begin());
+  thrust::copy(impl->c.begin(), impl->c.end(), h_c.begin());
 }
 
 template <class T>
 void ThrustStream<T>::copy()
 {
-  thrust::copy(a.begin(), a.end(),c.begin());
+  thrust::copy(impl->a.begin(), impl->a.end(),impl->c.begin());
   synchronise();
 }
 
@@ -91,9 +103,9 @@ void ThrustStream<T>::mul()
 {
   const T scalar = startScalar;
   thrust::transform(
-    c.begin(),
-    c.end(),
-    b.begin(),
+    impl->c.begin(),
+    impl->c.end(),
+    impl->b.begin(),
     [=] __device__ __host__ (const T &ci){
       return ci * scalar;
     }
@@ -105,10 +117,10 @@ template <class T>
 void ThrustStream<T>::add()
 {
   thrust::transform(
-    a.begin(),
-    a.end(),
-    b.begin(),
-    c.begin(),
+    impl->a.begin(),
+    impl->a.end(),
+    impl->b.begin(),
+    impl->c.begin(),
     [] __device__ __host__ (const T& ai, const T& bi){
       return ai + bi;
     }
@@ -121,10 +133,10 @@ void ThrustStream<T>::triad()
 {
   const T scalar = startScalar;
   thrust::transform(
-    b.begin(),
-    b.end(),
-    c.begin(),
-    a.begin(),
+    impl->b.begin(),
+    impl->b.end(),
+    impl->c.begin(),
+    impl->a.begin(),
     [=] __device__ __host__ (const T& bi, const T& ci){
       return bi + scalar * ci;
     }
@@ -137,9 +149,9 @@ void ThrustStream<T>::nstream()
 {
   const T scalar = startScalar;
   thrust::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(a.end(), b.end(), c.end())),
-    a.begin(),
+    thrust::make_zip_iterator(impl->a.begin(), impl->b.begin(), impl->c.begin()),
+    thrust::make_zip_iterator(impl->a.end(), impl->b.end(), impl->c.end()),
+    impl->a.begin(),
     thrust::make_zip_function(
       [=] __device__ __host__ (const T& ai, const T& bi, const T& ci){
         return ai + bi + scalar * ci;
@@ -151,7 +163,7 @@ void ThrustStream<T>::nstream()
 template <class T>
 T ThrustStream<T>::dot()
 {
-  return thrust::inner_product(a.begin(), a.end(), b.begin(), T{});
+  return thrust::inner_product(impl->a.begin(), impl->a.end(), impl->b.begin(), T{});
 }
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA || \
@@ -167,7 +179,7 @@ T ThrustStream<T>::dot()
 # error Unsupported compiler for Thrust
 #endif
 
-void check_error(void)
+void check_error()
 {
   IMPL_FN__(Error_t) err = IMPL_FN__(GetLastError());
   if (err != IMPL_FN__(Success))
@@ -177,7 +189,7 @@ void check_error(void)
   }
 }
 
-void listDevices(void)
+void listDevices()
 {
   // Get number of devices
   int count;
@@ -225,7 +237,7 @@ std::string getDeviceDriver(const int device)
 
 #else
 
-void listDevices(void)
+void listDevices()
 {
   std::cout << "0: CPU" << std::endl;
 }
diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h
index 676ecaeb..4c4999fe 100644
--- a/src/thrust/ThrustStream.h
+++ b/src/thrust/ThrustStream.h
@@ -1,5 +1,5 @@
-// Copyright (c) 2020 Tom Deakin
-// University of Bristol HPC
+// Copyright (c) 2020 Tom Deakin, 2025 Bernhard Manfred Gruber
+// University of Bristol HPC, NVIDIA
 //
 // For full license terms please see the LICENSE file distributed with this
 // source code
@@ -8,11 +8,8 @@
 
 #include <iostream>
 #include <stdexcept>
-#if defined(MANAGED)
-#include <thrust/universal_vector.h>
-#else
-#include <thrust/device_vector.h>
-#endif
+#include <memory>
+#include <vector>
 
 #include "Stream.h"
 
@@ -22,20 +19,13 @@ template <class T>
 class ThrustStream : public Stream<T>
 {
   protected:
-    // Size of arrays
+    struct Impl;
+    std::unique_ptr<Impl> impl; // avoid thrust vectors leaking into non-CUDA translation units
     intptr_t array_size;
-    #if defined(MANAGED)
-    thrust::universal_vector<T> a, b, c;
-    #else
-    thrust::device_vector<T> a, b, c;
-    std::vector<T> h_a, h_b, h_c;
-    #endif
-
   public:
-    ThrustStream(BenchId bs, const intptr_t array_size, const int device_id,
-                 T initA, T initB, T initC);
-    ~ThrustStream() = default;
+    ThrustStream(intptr_t array_size, int device);
+    ~ThrustStream();
 
     void copy() override;
     void add() override;
diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake
index 23627c11..f96ea6c5 100644
--- a/src/thrust/model.cmake
+++ b/src/thrust/model.cmake
@@ -1,13 +1,13 @@
 register_flag_optional(THRUST_IMPL
         "Which Thrust implementation to use, supported options include:
-         - CUDA (via https://github.com/NVIDIA/thrust)
+         - CUDA (via https://github.com/NVIDIA/CCCL (CUDA Core Compute Libraries))
          - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)
         "
         "CUDA")
 
 register_flag_optional(SDK_DIR
-        "Path to the selected Thrust implementation (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include` for NVHPC, `/opt/rocm` for ROCm)"
+        "Path to the installation prefix for CCCL or Thrust (e.g `/opt/nvidia/hpc_sdk/Linux_x86_64/24.5/cuda/12.4/lib64/cmake` for NVHPC, or `/usr/local/cuda-13.0/lib64/cmake` for nvcc, or `/opt/rocm` for ROCm)"
         "")
 
 register_flag_optional(BACKEND
@@ -18,7 +18,8 @@ register_flag_optional(BACKEND
         "
         "CUDA")
 
-register_flag_optional(MANAGED "Enabled managed memory mode." "OFF")
+register_flag_optional(MANAGED "Enabled managed memory mode."
+        "OFF")
 
 register_flag_optional(CMAKE_CUDA_COMPILER
         "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"
@@ -33,52 +34,53 @@ register_flag_optional(CUDA_EXTRA_FLAGS
         "[THRUST_IMPL==CUDA] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
         "")
 
+option(FETCH_CCCL "Fetch (download) the CCCL library. This uses CMake's FetchContent feature.
+    Specify version by setting FETCH_CCCL_VERSION" OFF)
+set(FETCH_CCCL_VERSION "v3.1.0" CACHE STRING "Specify version of CCCL to use if FETCH_CCCL is ON")
+
 macro(setup)
-    set(CMAKE_CXX_STANDARD 14)
+    set(CMAKE_CXX_STANDARD 17)
 
     if (MANAGED)
         register_definitions(MANAGED)
     endif ()
 
     if (${THRUST_IMPL} STREQUAL "CUDA")
-
-        # see CUDA.cmake, we're only adding a few Thrust related libraries here
-
         if (POLICY CMP0104)
             cmake_policy(SET CMP0104 NEW)
         endif ()
-
         set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-        # add -forward-unknown-to-host-compiler for compatibility reasons
         set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--expt-extended-lambda " ${CUDA_EXTRA_FLAGS})
         enable_language(CUDA)
-        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG
-        # appended later
+        # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG appended later
         wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})
 
         message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}")
-
-        # XXX NVHPC <= 21.9 has cub-config in `Linux_x86_64/21.9/cuda/11.4/include/cub/cmake`
-        # XXX NVHPC >= 22.3 has cub-config in `Linux_x86_64/22.3/cuda/11.6/lib64/cmake/cub/`
-        # same thing for thrust
+        # append SDK_DIR to help finding CCCL
         if (SDK_DIR)
+            # CMake tries several subdirectories below SDK_DIR, see documentation:
+            # https://cmake.org/cmake/help/latest/command/find_package.html#config-mode-search-procedure
            list(APPEND CMAKE_PREFIX_PATH ${SDK_DIR})
-            find_package(CUB REQUIRED CONFIG PATHS ${SDK_DIR}/cub)
-            find_package(Thrust REQUIRED CONFIG PATHS ${SDK_DIR}/thrust)
-        else ()
-            find_package(CUB REQUIRED CONFIG)
-            find_package(Thrust REQUIRED CONFIG)
         endif ()
 
-        message(STATUS "Using Thrust backend: ${BACKEND}")
-
-        # this creates the interface that we can link to
-        thrust_create_target(Thrust${BACKEND}
-                HOST CPP
-                DEVICE ${BACKEND})
-
-        register_link_library(Thrust${BACKEND})
+        # append CUDA Toolkit cmake config dir to help finding CCCL
+        find_package(CUDAToolkit REQUIRED)
+        list(APPEND CMAKE_PREFIX_PATH "${CUDAToolkit_LIBRARY_DIR}/cmake")
+
+        set(CCCL_THRUST_DEVICE_SYSTEM ${BACKEND} CACHE STRING "" FORCE)
+
+        # fetch CCCL if user wants to, otherwise just try to find it
+        if (FETCH_CCCL)
+            FetchContent_Declare(
+                    CCCL
+                    GIT_REPOSITORY https://github.com/nvidia/cccl.git
+                    GIT_TAG "${FETCH_CCCL_VERSION}"
+            )
+            FetchContent_MakeAvailable(CCCL)
+        else()
+            find_package(CCCL CONFIG REQUIRED)
+        endif()
+        register_link_library(CCCL::CCCL)
     elseif (${THRUST_IMPL} STREQUAL "ROCM")
        if (SDK_DIR)
             find_package(rocprim REQUIRED CONFIG PATHS ${SDK_DIR}/rocprim)