diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index 4e95fdbedcf..b64d6a47d3d 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -8,14 +8,12 @@ add_executable(codegen EXCLUDE_FROM_ALL codegen.cpp) target_compile_features(codegen PRIVATE cxx_std_20) -set( - atomic_generated_output - "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h" -) -set( - atomic_install_location - "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/functions" -) +set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/backends/") + +target_link_libraries(codegen PRIVATE fmt) + +set_property(TARGET codegen PROPERTY CXX_STANDARD 17) add_custom_target( libcudacxx.atomics.codegen diff --git a/libcudacxx/codegen/generators/header.h b/libcudacxx/codegen/generators/header.h index 41a9b5cdf72..4a97dfd2755 100644 --- a/libcudacxx/codegen/generators/header.h +++ b/libcudacxx/codegen/generators/header.h @@ -28,8 +28,8 @@ inline void FormatHeader(std::ostream& out) // This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H -#define _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H #include @@ -50,9 +50,9 @@ inline void FormatHeader(std::ostream& out) #include #include -#include -#include -#include +#include +#include +#include #include @@ -77,7 +77,7 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H // clang-format on )XXX"; diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h index d836d80f78a..b42ebd94eec 100644 --- 
a/libcudacxx/codegen/generators/ld_st.h +++ b/libcudacxx/codegen/generators/ld_st.h @@ -391,7 +391,7 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); } template -static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type volatile* __ptr, _Type& __val, int __memorder, _Sco) { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; diff --git a/libcudacxx/include/cuda/std/__atomic/functions.h b/libcudacxx/include/cuda/std/__atomic/backends.h similarity index 68% rename from libcudacxx/include/cuda/std/__atomic/functions.h rename to libcudacxx/include/cuda/std/__atomic/backends.h index b8de1c70176..96fff6a85b3 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions.h +++ b/libcudacxx/include/cuda/std/__atomic/backends.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDA_STD___ATOMIC_FUNCTIONS_H -#define __CUDA_STD___ATOMIC_FUNCTIONS_H +#ifndef __CUDA_STD___ATOMIC_BACKENDS_H +#define __CUDA_STD___ATOMIC_BACKENDS_H #include @@ -22,12 +22,15 @@ #endif // no system header #include - +#if _CCCL_CUDA_COMPILER(NVCC, >, 12, 8) +# include +#else // Device atomics -#include -#include +# include +#endif +#include // Host atomics -#include +#include -#endif // __CUDA_STD___ATOMIC_FUNCTIONS_H +#endif // __CUDA_STD___ATOMIC_BACKENDS_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/common.h b/libcudacxx/include/cuda/std/__atomic/backends/common.h similarity index 91% rename from libcudacxx/include/cuda/std/__atomic/functions/common.h rename to libcudacxx/include/cuda/std/__atomic/backends/common.h index 07ece7c4abe..7b097e3ef59 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/common.h 
+++ b/libcudacxx/include/cuda/std/__atomic/backends/common.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_COMMON_H -#define _CUDA_STD___ATOMIC_FUNCTIONS_COMMON_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_COMMON_H +#define _CUDA_STD___ATOMIC_BACKENDS_COMMON_H #include @@ -55,4 +55,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_COMMON_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_local.h similarity index 97% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_local.h index c6b07cfbf61..9ea7c1d3173 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_local.h @@ -7,8 +7,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef __CUDA_STD___ATOMIC_FUNCTIONS_CUDA_LOCAL_H -#define __CUDA_STD___ATOMIC_FUNCTIONS_CUDA_LOCAL_H +#ifndef __CUDA_STD___ATOMIC_BACKENDS_CUDA_LOCAL_H +#define __CUDA_STD___ATOMIC_BACKENDS_CUDA_LOCAL_H #include @@ -205,4 +205,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // __CUDA_STD___ATOMIC_FUNCTIONS_CUDA_LOCAL_H +#endif // __CUDA_STD___ATOMIC_BACKENDS_CUDA_LOCAL_H diff --git a/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm.h new file mode 100644 index 00000000000..3c096de7317 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm.h @@ -0,0 +1,421 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_H +# define _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_H + +# include + +# if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +# endif // no system header + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +# if _CCCL_CUDA_COMPILATION() + +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_system_tag) +{ + return ::__NV_THREAD_SCOPE_SYSTEM; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_device_tag) +{ + return ::__NV_THREAD_SCOPE_DEVICE; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_cluster_tag) +{ + return ::__NV_THREAD_SCOPE_CLUSTER; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_block_tag) +{ + return ::__NV_THREAD_SCOPE_BLOCK; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_thread_tag) +{ + return ::__NV_THREAD_SCOPE_THREAD; +} + +template +static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, _Sco) +{ + __atomic_thread_fence_nvvm_dispatch(__memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + if 
(__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) + { + return; + } + __atomic_load_nvvm_dispatch(__ptr_proxy, __dst_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + if (__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) + { + return; + } + __atomic_load_nvvm_dispatch(__ptr_proxy, __dst_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) + { + return; + } + __atomic_store_nvvm_dispatch(__ptr_proxy, __val_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type volatile* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) + { + return; + } + 
__atomic_store_nvvm_dispatch(__ptr_proxy, __val_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda( + _Type* __ptr, _Type* __exp, _Type __des, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + bool __res = false; + if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) + { + return __res; + } + return __atomic_compare_exchange_nvvm_dispatch( + __ptr_proxy, + __exp_proxy, + __des_proxy, + __weak, + __success_memorder, + __failure_memorder, + __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda( + _Type volatile* __ptr, _Type* __exp, _Type __des, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + bool __res = false; + if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) + { + return __res; + } + return __atomic_compare_exchange_nvvm_dispatch( + __ptr_proxy, + __exp_proxy, + __des_proxy, + __weak, + __success_memorder, + __failure_memorder, + __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __old, _Type __new, int __memorder, 
_Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + if (__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) + { + return; + } + __atomic_exchange_nvvm_dispatch( + __ptr_proxy, __new_proxy, __old_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE void +__atomic_exchange_cuda(_Type volatile* __ptr, _Type& __old, _Type __new, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + if (__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) + { + return; + } + __atomic_exchange_nvvm_dispatch( + __ptr_proxy, __new_proxy, __old_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_and_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, 
__atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_and_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_max_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = 
reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_max_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_min_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_min_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type 
__dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_or_nvvm_dispatch(__ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_or_nvvm_dispatch(__ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_xor_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) 
+{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_xor_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_add_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if 
(__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_add_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} + +# endif // _CCCL_HAS_CUDA_COMPILER() + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_H + +// clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_fallbacks.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_fallbacks.h new file mode 100644 index 00000000000..b6b0f9ef4fb --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_fallbacks.h @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA_STD___ATOMIC_BACKENDS_DEVICE_FALLBACKS_H
+#define __CUDA_STD___ATOMIC_BACKENDS_DEVICE_FALLBACKS_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD
+
+#if _CCCL_CUDA_COMPILATION()
+
+// NOTE(review): template parameter list reconstructed from the uses of _Type and
+// _Operand inside the struct — confirm against the authored file.
+template <class _Type, class _Operand>
+struct __atomic_nvvm_dispatch_helper
+{
+  using __bitwise    = __atomic_cuda_deduce_bitwise<_Type>;
+  using __arithmetic = __atomic_cuda_deduce_arithmetic<_Type>;
+  using __minmax     = __atomic_cuda_deduce_minmax<_Type>;
+
+  static constexpr bool __atomic_nvvm_is_native_arithmetic =
+    /* fp16 and up */ ((_Operand::__size >= 16) && (_Operand::__op == __atomic_cuda_operand::_f)) ||
+    /* 32 bits and up */ ((_Operand::__size >= 32));
+
+  // NOTE(review): this was a duplicate redefinition of __atomic_nvvm_is_native_arithmetic;
+  // the __enable_if_*_minmax aliases below require a minmax trait, so this rename is
+  // almost certainly what was intended — verify the condition matches min/max PTX support.
+  static constexpr bool __atomic_nvvm_is_native_minmax =
+    /* fp16 and up */ ((_Operand::__size >= 16) && (_Operand::__op == __atomic_cuda_operand::_f)) ||
+    /* 32 bits and up */ ((_Operand::__size >= 32));
+
+  static constexpr bool __atomic_nvvm_is_native_bitwise =
+    /* 32 bits and up */ ((__bitwise::__size >= 32));
+
+  static constexpr bool __atomic_nvvm_is_native_cas =
+    /* 16 bits and up */ ((__bitwise::__size >= 16));
+
+  // Native ld/st differs from PTX due to missing 8 bit constraints in inline PTX
+  static constexpr bool __atomic_nvvm_is_native_ld_st =
+    /* 8 bits and up */ ((__bitwise::__size >= 8));
+
+  using __enable_if_native_arithmetic     = enable_if_t<__atomic_nvvm_is_native_arithmetic, bool>;
+  using __enable_if_not_native_arithmetic = enable_if_t<!__atomic_nvvm_is_native_arithmetic, bool>;
+
+  using __enable_if_native_minmax     = enable_if_t<__atomic_nvvm_is_native_minmax, bool>;
+  using __enable_if_not_native_minmax = enable_if_t<!__atomic_nvvm_is_native_minmax, bool>;
+
+  using __enable_if_native_bitwise = 
enable_if_t<__atomic_nvvm_is_native_bitwise, bool>; + using __enable_if_not_native_bitwise = enable_if_t; + + using __enable_if_native_cas = enable_if_t<__atomic_nvvm_is_native_cas, bool>; + using __enable_if_not_native_cas = enable_if_t; + + using __enable_if_native_ld_st = enable_if_t<__atomic_nvvm_is_native_ld_st, bool>; + using __enable_if_not_native_ld_st = enable_if_t; +}; + +template ::__enable_if_not_native_ld_st = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_load_nvvm_dispatch(const _Type* __ptr, _Type* __dst, int __memorder, int __sco) +{} + +template ::__enable_if_not_native_ld_st = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_store_nvvm_dispatch(_Type* __ptr, _Type* __val, int __memorder, int __sco) +{} + +template ::__enable_if_not_native_cas = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE bool __atomic_compare_exchange_nvvm_dispatch( + _Type* __ptr, _Type* __exp, _Type* __des, bool __weak, int __success_memorder, int __failure_memorder, int __sco) +{} + +template ::__enable_if_not_native_cas = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_exchange_nvvm_dispatch(_Type* __atom, _Type* __val, _Type* __ret, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_max_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_min_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_and_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_or_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_xor_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + 
+template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_add_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template = 0> +static inline _CCCL_DEVICE void __atomic_load_nonnative(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint16_t) - 1); + uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint16_t((intptr_t) __ptr & __alignmask) * 8; + + uint16_t __value = 0; + __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{}); + + __dst = static_cast<_Type>(__value >> __offset); +} + +template = 0> +static inline _CCCL_DEVICE bool +__atomic_cas_nonnative(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + const uint32_t __cmpOffset = __cmp << __offset; + const uint32_t __opOffset = __op << __offset; + + // Algorithm for 8b CAS with 32b intrinsics + // __old = __window[0:32] where [__cmp] resides within some offset. 
+ uint32_t __old; + // Start by loading __old with the current value, this optimizes for early return when __cmp is wrong + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + // Reemit CAS instructions until we succeed or the old value is a mismatch + while (__cmpOffset == (__old & __valueMask)) + { + // Combine the desired value and most recently fetched expected masked portion of the window + const uint32_t __attempt = (__old & __windowMask) | __opOffset; + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return true; + } + } + __dst = static_cast<_Type>(__old >> __offset); + return false; +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> +_CCCL_DEVICE _Type __atomic_fetch_update_nonnative(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + + // 8/16b fetch update is similar to CAS implementation, but compresses the logic for recalculating the operand + // __old = __window[0:32] where [__cmp] resides within some offset. 
+ uint32_t __old; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + // Reemit CAS instructions until we succeed + while (1) + { + // Calculate new desired value from last fetched __old + // Use of the value mask is required due to the possibility of overflow when ops are widened. Possible compiler bug? + const uint32_t __attempt = + ((static_cast(__op(static_cast<_Type>(__old >> __offset))) << __offset) & __valueMask) + | (__old & __windowMask); + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return static_cast<_Type>(__old >> __offset); + } + } +} + +#endif // ^_CCCL_CUDA_COMPILATION() + +_CCCL_END_NAMESPACE_CUDA_STD + +#endif // __CUDA_STD___ATOMIC_BACKENDS_DEVICE_FALLBACKS_H diff --git a/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_wrapped.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_wrapped.h new file mode 100644 index 00000000000..451afc7121a --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_wrapped.h @@ -0,0 +1,268 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_WRAPPED_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_WRAPPED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +#if _CCCL_CUDA_COMPILATION() + +# define __ATOMIC_NVVM_WRAP(...) (__VA_ARGS__) +# define __ATOMIC_NVVM_UNWRAP1(...) __VA_ARGS__ +# define __ATOMIC_NVVM_UNWRAP(...) __ATOMIC_NVVM_UNWRAP1 __VA_ARGS__ + +# define __ATOMIC_SWITCH(val, ...) \ + switch (val) \ + { \ + __VA_ARGS__ \ + } + +# define __ATOMIC_CASE(test, fn, ...) \ + case test: \ + _CCCL_PP_OBSTRUCT(fn)(test, ##__VA_ARGS__) break; + +# define __ATOMIC_SCOPE_CASES_SM90(...) \ + /* THREAD */ __ATOMIC_CASE(__NV_THREAD_SCOPE_THREAD, __VA_ARGS__) \ + /* BLOCK */ __ATOMIC_CASE(__NV_THREAD_SCOPE_BLOCK, __VA_ARGS__) \ + /* DEVICE */ __ATOMIC_CASE(__NV_THREAD_SCOPE_DEVICE, __VA_ARGS__) \ + /* SYSTEM */ __ATOMIC_CASE(__NV_THREAD_SCOPE_SYSTEM, __VA_ARGS__) \ + /* CLUSTER */ __ATOMIC_CASE(__NV_THREAD_SCOPE_CLUSTER, __VA_ARGS__) + +# define __ATOMIC_SCOPE_CASES(...) \ + /* THREAD */ __ATOMIC_CASE(__NV_THREAD_SCOPE_THREAD, __VA_ARGS__) \ + /* BLOCK */ __ATOMIC_CASE(__NV_THREAD_SCOPE_BLOCK, __VA_ARGS__) \ + /* DEVICE */ __ATOMIC_CASE(__NV_THREAD_SCOPE_DEVICE, __VA_ARGS__) \ + /* SYSTEM */ __ATOMIC_CASE(__NV_THREAD_SCOPE_SYSTEM, __VA_ARGS__) + +# define __ATOMIC_ALL_ORDER_CASES(...) 
\ + /* RELAXED */ __ATOMIC_CASE(__NV_ATOMIC_RELAXED, __VA_ARGS__) \ + /* CONSUME */ __ATOMIC_CASE(__NV_ATOMIC_CONSUME, __VA_ARGS__) \ + /* ACQUIRE */ __ATOMIC_CASE(__NV_ATOMIC_ACQUIRE, __VA_ARGS__) \ + /* RELEASE */ __ATOMIC_CASE(__NV_ATOMIC_RELEASE, __VA_ARGS__) \ + /* ACQ_REL */ __ATOMIC_CASE(__NV_ATOMIC_ACQ_REL, __VA_ARGS__) \ + /* SEQ_CST */ __ATOMIC_CASE(__NV_ATOMIC_SEQ_CST, __VA_ARGS__) + +# define __ATOMIC_READ_CASES(...) \ + /* RELAXED */ __ATOMIC_CASE(__NV_ATOMIC_RELAXED, __VA_ARGS__) \ + /* CONSUME */ __ATOMIC_CASE(__NV_ATOMIC_CONSUME, __VA_ARGS__) \ + /* ACQUIRE */ __ATOMIC_CASE(__NV_ATOMIC_ACQUIRE, __VA_ARGS__) \ + /* SEQ_CST */ __ATOMIC_CASE(__NV_ATOMIC_SEQ_CST, __VA_ARGS__) + +# define __ATOMIC_WRITE_CASES(...) \ + /* RELAXED */ __ATOMIC_CASE(__NV_ATOMIC_RELAXED, __VA_ARGS__) \ + /* RELEASE */ __ATOMIC_CASE(__NV_ATOMIC_RELEASE, __VA_ARGS__) \ + /* SEQ_CST */ __ATOMIC_CASE(__NV_ATOMIC_SEQ_CST, __VA_ARGS__) + +# define __ATOMIC_FENCE_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) +# define __ATOMIC_EXCHANGE_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) +# define __ATOMIC_FETCH_OP_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) + +# define __ATOMIC_COMPARE_SUCCESS_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) +# define __ATOMIC_COMPARE_FAILURE_CASES(...) __ATOMIC_READ_CASES(__VA_ARGS__) + +# define __ATOMIC_SCOPES_SWITCH(scope, scopes, ...) __ATOMIC_SWITCH(scope, scopes(__VA_ARGS__)) +# define __ATOMIC_ORDER_SWITCH(order, orders, ...) __ATOMIC_SWITCH(order, orders(__VA_ARGS__)) + +# define __ATOMIC_NVVM_BUILTIN2(_scope, intrinsic, ...) intrinsic(__ATOMIC_NVVM_UNWRAP(__VA_ARGS__), _scope); +# define __ATOMIC_NVVM_BUILTIN1(_order, intrinsic, scope, scopes, ...) \ + __ATOMIC_SCOPES_SWITCH( \ + scope, scopes, __ATOMIC_NVVM_BUILTIN2, intrinsic, __ATOMIC_NVVM_WRAP(__ATOMIC_NVVM_UNWRAP(__VA_ARGS__), _order)) +# define __ATOMIC_NVVM_BUILTIN0(_order, intrinsic, order, orders, scope, scopes, ...) 
\ + __ATOMIC_ORDER_SWITCH( \ + order, \ + orders, \ + __ATOMIC_NVVM_BUILTIN1, \ + intrinsic, \ + scope, \ + scopes, \ + __ATOMIC_NVVM_WRAP(__ATOMIC_NVVM_UNWRAP(__VA_ARGS__), _order)) + +// An attempted explanation: +// We pass down macro function names and arguments through functions that create switch statements, the cases expand +// them by eventually invoking the passed in `__ATOMIC_NVVM_BUILTIN#` with the now concrete case value selected in the +// switch - This then calls another switch builder, uses another macro function, and expands again. Arguments to the +// function are packed inside of `()` by __ATOMIC_NVVM_WRAP/UNWRAP in order to prevent any accidental escape. +# define __ATOMIC_NVVM_BUILTIN(intrinsic, order, orders, scope, ...) \ + NV_IF_ELSE_TARGET( \ + NV_PROVIDES_SM_90, \ + ({__ATOMIC_ORDER_SWITCH( \ + order, \ + orders, \ + __ATOMIC_NVVM_BUILTIN1, \ + intrinsic, \ + scope, \ + __ATOMIC_SCOPE_CASES_SM90, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))}), \ + ({__ATOMIC_ORDER_SWITCH( \ + order, \ + orders, \ + __ATOMIC_NVVM_BUILTIN1, \ + intrinsic, \ + scope, \ + __ATOMIC_SCOPE_CASES, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))})) + +// __ATOMIC_NVVM_BUILTIN_SF selects three times for compare_exchange +# define __ATOMIC_NVVM_BUILTIN_SF(intrinsic, success, sorders, failure, forders, scope, ...) 
\ + NV_IF_ELSE_TARGET( \ + NV_PROVIDES_SM_90, \ + ({__ATOMIC_ORDER_SWITCH( \ + success, \ + sorders, \ + __ATOMIC_NVVM_BUILTIN0, \ + intrinsic, \ + failure, \ + forders, \ + scope, \ + __ATOMIC_SCOPE_CASES_SM90, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))}), \ + ({__ATOMIC_ORDER_SWITCH( \ + success, \ + sorders, \ + __ATOMIC_NVVM_BUILTIN0, \ + intrinsic, \ + failure, \ + forders, \ + scope, \ + __ATOMIC_SCOPE_CASES, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))})) + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_thread_fence_nvvm_dispatch(const _Type* __ptr, _Type* __dst, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN(__nv_atomic_thread_fence, __memorder, __ATOMIC_FENCE_CASES, __sco)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_load_nvvm_dispatch(const _Type* __ptr, _Type* __dst, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN(__nv_atomic_load, __memorder, __ATOMIC_READ_CASES, __sco, __ptr, __dst)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_store_nvvm_dispatch(_Type* __ptr, _Type* __val, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN(__nv_atomic_store, __memorder, __ATOMIC_WRITE_CASES, __sco, __ptr, __val)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE bool __atomic_compare_exchange_nvvm_dispatch( + _Type* __ptr, _Type* __exp, _Type* __des, bool __weak, int __success_memorder, int __failure_memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN_SF( + return __nv_atomic_compare_exchange, + __success_memorder, + __ATOMIC_COMPARE_SUCCESS_CASES, + __failure_memorder, + __ATOMIC_COMPARE_FAILURE_CASES, + __sco, + __ptr, + __exp, + __des, + __weak)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_exchange_nvvm_dispatch(_Type* __atom, _Type* __val, _Type* __ret, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + 
__ATOMIC_NVVM_BUILTIN(__nv_atomic_exchange, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __atom, __val, __ret)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_max_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_max, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_min_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_min, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_and_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_and, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_or_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_or, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_xor_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_xor, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_add_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_add, __memorder, __ATOMIC_EXCHANGE_CASES, 
__sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +#endif // _CCCL_CUDA_COMPILATION + +_CCCL_END_NAMESPACE_CUDA_STD + +#include + +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_WRAPPED_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_derived.h similarity index 98% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_derived.h index 7ebca48711f..56e17c520d6 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_derived.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDA_STD___ATOMIC_FUNCTIONS_DERIVED_H -#define __CUDA_STD___ATOMIC_FUNCTIONS_DERIVED_H +#ifndef __CUDA_STD___ATOMIC_BACKENDS_DERIVED_H +#define __CUDA_STD___ATOMIC_BACKENDS_DERIVED_H #include @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -448,4 +448,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // __CUDA_STD___ATOMIC_FUNCTIONS_DERIVED_H +#endif // __CUDA_STD___ATOMIC_BACKENDS_DERIVED_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_generated.h similarity index 99% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_generated.h index f3e30d53039..4f701866828 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_generated.h @@ -11,8 +11,8 @@ // This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H 
-#define _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H #include @@ -33,9 +33,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include @@ -1371,7 +1371,7 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); } template -static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type volatile* __ptr, _Type& __val, int __memorder, _Sco) { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; @@ -4431,6 +4431,6 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H // clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_supported_atomics_helper.h similarity index 97% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_supported_atomics_helper.h index cd221248b99..e462f09512e 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_supported_atomics_helper.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H -#define _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_SUPPORTED_ATOMICS_HELPER_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_SUPPORTED_ATOMICS_HELPER_H #include @@ 
-181,4 +181,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_SUPPORTED_ATOMICS_HELPER_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/backends/host.h similarity index 97% rename from libcudacxx/include/cuda/std/__atomic/functions/host.h rename to libcudacxx/include/cuda/std/__atomic/backends/host.h index 0a20a333ff8..48350d6ce57 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/host.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/host.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD___ATOMICS_FUNCTIONS_HOST_H -#define _CUDA_STD___ATOMICS_FUNCTIONS_HOST_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_HOST_H +#define _CUDA_STD___ATOMIC_BACKENDS_HOST_H #include @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -239,4 +239,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMICS_FUNCTIONS_HOST_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_HOST_H diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h index 6efe67f10a6..9aa8e6060c9 100644 --- a/libcudacxx/include/cuda/std/__atomic/order.h +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -109,30 +109,43 @@ _CCCL_API inline int __stronger_order_cuda(int __a, int __b) _CCCL_API constexpr int __atomic_order_to_int(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELEASE - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? 
__ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); + switch (__order) + { + default: + return __ATOMIC_CONSUME; + case memory_order_relaxed: + return __ATOMIC_RELAXED; + case memory_order_acquire: + return __ATOMIC_ACQUIRE; + case memory_order_release: + return __ATOMIC_RELEASE; + case memory_order_seq_cst: + return __ATOMIC_SEQ_CST; + case memory_order_acq_rel: + return __ATOMIC_ACQ_REL; + } } _CCCL_API constexpr int __atomic_failure_order_to_int(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELAXED - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME)))); + // Note: + // release -> relaxed + // acq_rel -> acquire + switch (__order) + { + default: + return __ATOMIC_CONSUME; + case memory_order_relaxed: + return __ATOMIC_RELAXED; + case memory_order_acquire: + return __ATOMIC_ACQUIRE; + case memory_order_release: + return __ATOMIC_RELAXED; + case memory_order_seq_cst: + return __ATOMIC_SEQ_CST; + case memory_order_acq_rel: + return __ATOMIC_ACQUIRE; + } } static_assert((is_same_v::type, __memory_order_underlying_t>), diff --git a/libcudacxx/include/cuda/std/__atomic/types/base.h b/libcudacxx/include/cuda/std/__atomic/types/base.h index aacd8453c8a..cd8d2cb4983 100644 --- a/libcudacxx/include/cuda/std/__atomic/types/base.h +++ b/libcudacxx/include/cuda/std/__atomic/types/base.h @@ -21,7 +21,8 @@ # pragma system_header #endif // no system header -#include +#include +#include #include #include @@ -69,7 +70,7 @@ _CCCL_API inline void __atomic_thread_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());), + 
(__atomic_thread_fence_cuda(::cuda::std::__atomic_order_to_int(__order), __thread_scope_system_tag());), NV_IS_HOST, (__atomic_thread_fence_host(__order);)) } @@ -77,7 +78,7 @@ _CCCL_API inline void __atomic_thread_fence_dispatch(memory_order __order) _CCCL_API inline void __atomic_signal_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET(NV_IS_DEVICE, - (__atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order));), + (__atomic_signal_fence_cuda(::cuda::std::__atomic_order_to_int(__order));), NV_IS_HOST, (__atomic_signal_fence_host(__order);)) } @@ -91,21 +92,20 @@ _CCCL_API void __atomic_init_dispatch(_Sto* __a, _Up __val) template = 0> _CCCL_API void __atomic_store_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), - NV_IS_HOST, - (__atomic_store_host(__a->get(), __val, __order);)) + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (__atomic_store_n_cuda(__a->get(), __val, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), + NV_IS_HOST, + (__atomic_store_host(__a->get(), __val, __order);)) } template = 0> _CCCL_API auto __atomic_load_dispatch(const _Sto* __a, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto> { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__a->get(), static_cast<__memory_order_underlying_t>(__order), _Sco{});), - NV_IS_HOST, - (return __atomic_load_host(__a->get(), __order);)) + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (return __atomic_load_n_cuda(__a->get(), ::cuda::std::__atomic_order_to_int(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_load_host(__a->get(), __order);)) + _CCCL_UNREACHABLE(); } template = 0> @@ -114,7 +114,7 @@ _CCCL_API auto __atomic_exchange_dispatch(_Sto* __a, _Up __value, memory_order _ { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__a->get(), __value, 
static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_exchange_n_cuda(__a->get(), __value, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_exchange_host(__a->get(), __value, __order);)) } @@ -131,8 +131,8 @@ _CCCL_API bool __atomic_compare_exchange_strong_dispatch( __expected, __val, false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), + ::cuda::std::__atomic_order_to_int(__success), + ::cuda::std::__atomic_order_to_int(__failure), _Sco{});), NV_IS_HOST, (__result = __atomic_compare_exchange_strong_host(__a->get(), __expected, __val, __success, __failure);)) @@ -151,8 +151,8 @@ _CCCL_API bool __atomic_compare_exchange_weak_dispatch( __expected, __val, true, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), + ::cuda::std::__atomic_order_to_int(__success), + ::cuda::std::__atomic_order_to_int(__failure), _Sco{});), NV_IS_HOST, (__result = __atomic_compare_exchange_weak_host(__a->get(), __expected, __val, __success, __failure);)) @@ -165,7 +165,7 @@ _CCCL_API auto __atomic_fetch_add_dispatch(_Sto* __a, _Up __delta, memory_order { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_add_cuda(__a->get(), __delta, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_add_host(__a->get(), __delta, __order);)) } @@ -176,7 +176,7 @@ _CCCL_API auto __atomic_fetch_sub_dispatch(_Sto* __a, _Up __delta, memory_order { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_sub_cuda(__a->get(), __delta, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_sub_host(__a->get(), 
__delta, __order);)) } @@ -187,7 +187,7 @@ _CCCL_API auto __atomic_fetch_and_dispatch(_Sto* __a, _Up __pattern, memory_orde { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_and_cuda(__a->get(), __pattern, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_and_host(__a->get(), __pattern, __order);)) } @@ -198,7 +198,7 @@ _CCCL_API auto __atomic_fetch_or_dispatch(_Sto* __a, _Up __pattern, memory_order { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_or_cuda(__a->get(), __pattern, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_or_host(__a->get(), __pattern, __order);)) } @@ -209,7 +209,7 @@ _CCCL_API auto __atomic_fetch_xor_dispatch(_Sto* __a, _Up __pattern, memory_orde { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_xor_cuda(__a->get(), __pattern, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_xor_host(__a->get(), __pattern, __order);)) } @@ -220,7 +220,7 @@ _CCCL_API auto __atomic_fetch_max_dispatch(_Sto* __a, _Up __val, memory_order __ { NV_IF_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_max_cuda(__a->get(), __val, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), (return __atomic_fetch_max_host(__a->get(), __val, __order);)) } @@ -230,7 +230,7 @@ _CCCL_API auto __atomic_fetch_min_dispatch(_Sto* __a, _Up __val, memory_order __ { NV_IF_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__a->get(), __val, 
static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_min_cuda(__a->get(), __val, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), (return __atomic_fetch_min_host(__a->get(), __val, __order);)) } diff --git a/libcudacxx/include/cuda/std/__cccl/preprocessor.h b/libcudacxx/include/cuda/std/__cccl/preprocessor.h index 8f40726c10c..61610a2941f 100644 --- a/libcudacxx/include/cuda/std/__cccl/preprocessor.h +++ b/libcudacxx/include/cuda/std/__cccl/preprocessor.h @@ -38,9 +38,13 @@ CCCL_IGNORE_MSVC_TRADITIONAL_PREPROCESSOR_WARNING to suppress this warning. #define _CCCL_PP_SECOND(_, second, ...) second #define _CCCL_PP_THIRD(_1, _2, third) third -#define _CCCL_PP_EXPAND(...) __VA_ARGS__ +#define _CCCL_PP_EXPAND1(...) __VA_ARGS__ +#define _CCCL_PP_EXPAND(...) _CCCL_PP_EXPAND1(_CCCL_PP_EXPAND1(__VA_ARGS__)) #define _CCCL_PP_EAT(...) +#define _CCCL_PP_DEFER(id) id _CCCL_PP_EAT() +#define _CCCL_PP_OBSTRUCT(...) __VA_ARGS__ _CCCL_PP_DEFER(_CCCL_PP_EAT)() + #define _CCCL_PP_CAT_(_Xp, ...) _Xp##__VA_ARGS__ #define _CCCL_PP_CAT(_Xp, ...) _CCCL_PP_CAT_(_Xp, __VA_ARGS__) diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index b6619996d43..221c699dc5d 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -559,7 +559,6 @@ atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept } // flag type and operations - struct atomic_flag { __atomic_storage_t<_CCCL_ATOMIC_FLAG_TYPE> __a; @@ -730,7 +729,6 @@ _CCCL_API inline void atomic_flag_notify_all(atomic_flag* __o) noexcept } #endif - // fences _CCCL_API inline void atomic_thread_fence(memory_order __m) noexcept