diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index 4e95fdbedcf..b64d6a47d3d 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -8,14 +8,12 @@ add_executable(codegen EXCLUDE_FROM_ALL codegen.cpp) target_compile_features(codegen PRIVATE cxx_std_20) -set( - atomic_generated_output - "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h" -) -set( - atomic_install_location - "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/functions" -) +set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/backends/") + +target_link_libraries(codegen PRIVATE fmt) + +set_property(TARGET codegen PROPERTY CXX_STANDARD 17) add_custom_target( libcudacxx.atomics.codegen diff --git a/libcudacxx/codegen/generators/header.h b/libcudacxx/codegen/generators/header.h index 41a9b5cdf72..4a97dfd2755 100644 --- a/libcudacxx/codegen/generators/header.h +++ b/libcudacxx/codegen/generators/header.h @@ -28,8 +28,8 @@ inline void FormatHeader(std::ostream& out) // This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H -#define _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H #include @@ -50,9 +50,9 @@ inline void FormatHeader(std::ostream& out) #include #include -#include -#include -#include +#include +#include +#include #include @@ -77,7 +77,7 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H // clang-format on )XXX"; diff --git a/libcudacxx/codegen/generators/ld_st.h b/libcudacxx/codegen/generators/ld_st.h index d836d80f78a..b42ebd94eec 100644 --- 
a/libcudacxx/codegen/generators/ld_st.h +++ b/libcudacxx/codegen/generators/ld_st.h @@ -391,7 +391,7 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); } template -static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type volatile* __ptr, _Type& __val, int __memorder, _Sco) { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; diff --git a/libcudacxx/include/cuda/std/__atomic/functions.h b/libcudacxx/include/cuda/std/__atomic/backends.h similarity index 68% rename from libcudacxx/include/cuda/std/__atomic/functions.h rename to libcudacxx/include/cuda/std/__atomic/backends.h index b8de1c70176..96fff6a85b3 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions.h +++ b/libcudacxx/include/cuda/std/__atomic/backends.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDA_STD___ATOMIC_FUNCTIONS_H -#define __CUDA_STD___ATOMIC_FUNCTIONS_H +#ifndef __CUDA_STD___ATOMIC_BACKENDS_H +#define __CUDA_STD___ATOMIC_BACKENDS_H #include @@ -22,12 +22,15 @@ #endif // no system header #include - +#if _CCCL_CUDA_COMPILER(NVCC, >, 12, 8) +# include +#else // Device atomics -#include -#include +# include +#endif +#include // Host atomics -#include +#include -#endif // __CUDA_STD___ATOMIC_FUNCTIONS_H +#endif // __CUDA_STD___ATOMIC_BACKENDS_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/common.h b/libcudacxx/include/cuda/std/__atomic/backends/common.h similarity index 91% rename from libcudacxx/include/cuda/std/__atomic/functions/common.h rename to libcudacxx/include/cuda/std/__atomic/backends/common.h index 07ece7c4abe..7b097e3ef59 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/common.h 
+++ b/libcudacxx/include/cuda/std/__atomic/backends/common.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_COMMON_H -#define _CUDA_STD___ATOMIC_FUNCTIONS_COMMON_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_COMMON_H +#define _CUDA_STD___ATOMIC_BACKENDS_COMMON_H #include @@ -55,4 +55,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_COMMON_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_local.h similarity index 97% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_local.h index c6b07cfbf61..9ea7c1d3173 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_local.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_local.h @@ -7,8 +7,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
// //===----------------------------------------------------------------------===// -#ifndef __CUDA_STD___ATOMIC_FUNCTIONS_CUDA_LOCAL_H -#define __CUDA_STD___ATOMIC_FUNCTIONS_CUDA_LOCAL_H +#ifndef __CUDA_STD___ATOMIC_BACKENDS_CUDA_LOCAL_H +#define __CUDA_STD___ATOMIC_BACKENDS_CUDA_LOCAL_H #include @@ -205,4 +205,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // __CUDA_STD___ATOMIC_FUNCTIONS_CUDA_LOCAL_H +#endif // __CUDA_STD___ATOMIC_BACKENDS_CUDA_LOCAL_H diff --git a/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm.h new file mode 100644 index 00000000000..3c096de7317 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm.h @@ -0,0 +1,421 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_H +# define _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_H + +# include + +# if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +# endif // no system header + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +# if _CCCL_CUDA_COMPILATION() + +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_system_tag) +{ + return ::__NV_THREAD_SCOPE_SYSTEM; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_device_tag) +{ + return ::__NV_THREAD_SCOPE_DEVICE; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_cluster_tag) +{ + return ::__NV_THREAD_SCOPE_CLUSTER; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_block_tag) +{ + return ::__NV_THREAD_SCOPE_BLOCK; +} +constexpr _CCCL_DEVICE int __atomic_scope_tag_to_nvvm_scope(::cuda::std::__thread_scope_thread_tag) +{ + return ::__NV_THREAD_SCOPE_THREAD; +} + +template +static inline _CCCL_DEVICE void __atomic_thread_fence_cuda(int __memorder, _Sco) +{ + __atomic_thread_fence_nvvm_dispatch(__memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + if 
(__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) + { + return; + } + __atomic_load_nvvm_dispatch(__ptr_proxy, __dst_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE void __atomic_load_cuda(const _Type volatile* __ptr, _Type& __dst, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + const __proxy_t* __ptr_proxy = reinterpret_cast(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + if (__cuda_load_weak_if_local(__ptr_proxy, __dst_proxy, sizeof(__proxy_t))) + { + return; + } + __atomic_load_nvvm_dispatch(__ptr_proxy, __dst_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) + { + return; + } + __atomic_store_nvvm_dispatch(__ptr_proxy, __val_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type volatile* __ptr, _Type& __val, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __val_proxy = reinterpret_cast<__proxy_t*>(&__val); + if (__cuda_store_weak_if_local(__ptr_proxy, __val_proxy, sizeof(__proxy_t))) + { + return; + } + 
__atomic_store_nvvm_dispatch(__ptr_proxy, __val_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda( + _Type* __ptr, _Type* __exp, _Type __des, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + bool __res = false; + if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) + { + return __res; + } + return __atomic_compare_exchange_nvvm_dispatch( + __ptr_proxy, + __exp_proxy, + __des_proxy, + __weak, + __success_memorder, + __failure_memorder, + __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE bool __atomic_compare_exchange_cuda( + _Type volatile* __ptr, _Type* __exp, _Type __des, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __exp_proxy = reinterpret_cast<__proxy_t*>(__exp); + __proxy_t* __des_proxy = reinterpret_cast<__proxy_t*>(&__des); + bool __res = false; + if (__cuda_compare_exchange_weak_if_local(__ptr_proxy, __exp_proxy, __des_proxy, &__res)) + { + return __res; + } + return __atomic_compare_exchange_nvvm_dispatch( + __ptr_proxy, + __exp_proxy, + __des_proxy, + __weak, + __success_memorder, + __failure_memorder, + __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE void __atomic_exchange_cuda(_Type* __ptr, _Type& __old, _Type __new, int __memorder, 
_Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + if (__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) + { + return; + } + __atomic_exchange_nvvm_dispatch( + __ptr_proxy, __new_proxy, __old_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE void +__atomic_exchange_cuda(_Type volatile* __ptr, _Type& __old, _Type __new, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __old_proxy = reinterpret_cast<__proxy_t*>(&__old); + __proxy_t* __new_proxy = reinterpret_cast<__proxy_t*>(&__new); + if (__cuda_exchange_weak_if_local(__ptr_proxy, __new_proxy, __old_proxy)) + { + return; + } + __atomic_exchange_nvvm_dispatch( + __ptr_proxy, __new_proxy, __old_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_and_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, 
__atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_and_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_and_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_max_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = 
reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_max_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_max_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_min_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_minmax<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_minmax<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_min_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_min_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type 
__dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_or_nvvm_dispatch(__ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_or_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_or_nvvm_dispatch(__ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(__ptr); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_xor_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template = 0> +static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) 
+{ + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_xor_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_xor_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if (__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_add_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(volatile _Type* __ptr, _Up __op, int __memorder, _Sco) +{ + constexpr auto __skip_v = 1; + __op = __op * __skip_v; + using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; + using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; + _Type __dst{}; + __proxy_t* __ptr_proxy = reinterpret_cast<__proxy_t*>(const_cast<_Type*>(__ptr)); + __proxy_t* __dst_proxy = reinterpret_cast<__proxy_t*>(&__dst); + __proxy_t* __op_proxy = reinterpret_cast<__proxy_t*>(&__op); + if 
(__cuda_fetch_add_weak_if_local(__ptr_proxy, *__op_proxy, __dst_proxy)) + { + return __dst; + } + return __atomic_fetch_add_nvvm_dispatch( + __ptr_proxy, *__op_proxy, __memorder, __atomic_scope_tag_to_nvvm_scope(_Sco{})); +} + +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} +template +static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco) +{ + return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{}); +} + +# endif // _CCCL_HAS_CUDA_COMPILER() + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_H + +// clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_fallbacks.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_fallbacks.h new file mode 100644 index 00000000000..b6b0f9ef4fb --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_fallbacks.h @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA_STD___ATOMIC_BACKENDS_DEVICE_FALLBACKS_H
+#define __CUDA_STD___ATOMIC_BACKENDS_DEVICE_FALLBACKS_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD
+
+#if _CCCL_CUDA_COMPILATION()
+
+// NOTE(review): template parameter list reconstructed from the uses of _Type and
+// _Operand inside the struct — confirm against the authored file.
+template <class _Type, class _Operand>
+struct __atomic_nvvm_dispatch_helper
+{
+  using __bitwise    = __atomic_cuda_deduce_bitwise<_Type>;
+  using __arithmetic = __atomic_cuda_deduce_arithmetic<_Type>;
+  using __minmax     = __atomic_cuda_deduce_minmax<_Type>;
+
+  static constexpr bool __atomic_nvvm_is_native_arithmetic =
+    /* fp16 and up */ ((_Operand::__size >= 16) && (_Operand::__op == __atomic_cuda_operand::_f)) ||
+    /* 32 bits and up */ ((_Operand::__size >= 32));
+
+  // NOTE(review): this was a duplicate redefinition of __atomic_nvvm_is_native_arithmetic;
+  // the __enable_if_*_minmax aliases below require a minmax trait, so this rename is
+  // almost certainly what was intended — verify the condition matches min/max PTX support.
+  static constexpr bool __atomic_nvvm_is_native_minmax =
+    /* fp16 and up */ ((_Operand::__size >= 16) && (_Operand::__op == __atomic_cuda_operand::_f)) ||
+    /* 32 bits and up */ ((_Operand::__size >= 32));
+
+  static constexpr bool __atomic_nvvm_is_native_bitwise =
+    /* 32 bits and up */ ((__bitwise::__size >= 32));
+
+  static constexpr bool __atomic_nvvm_is_native_cas =
+    /* 16 bits and up */ ((__bitwise::__size >= 16));
+
+  // Native ld/st differs from PTX due to missing 8 bit constraints in inline PTX
+  static constexpr bool __atomic_nvvm_is_native_ld_st =
+    /* 8 bits and up */ ((__bitwise::__size >= 8));
+
+  using __enable_if_native_arithmetic     = enable_if_t<__atomic_nvvm_is_native_arithmetic, bool>;
+  using __enable_if_not_native_arithmetic = enable_if_t<!__atomic_nvvm_is_native_arithmetic, bool>;
+
+  using __enable_if_native_minmax     = enable_if_t<__atomic_nvvm_is_native_minmax, bool>;
+  using __enable_if_not_native_minmax = enable_if_t<!__atomic_nvvm_is_native_minmax, bool>;
+
+  using __enable_if_native_bitwise = 
enable_if_t<__atomic_nvvm_is_native_bitwise, bool>; + using __enable_if_not_native_bitwise = enable_if_t; + + using __enable_if_native_cas = enable_if_t<__atomic_nvvm_is_native_cas, bool>; + using __enable_if_not_native_cas = enable_if_t; + + using __enable_if_native_ld_st = enable_if_t<__atomic_nvvm_is_native_ld_st, bool>; + using __enable_if_not_native_ld_st = enable_if_t; +}; + +template ::__enable_if_not_native_ld_st = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_load_nvvm_dispatch(const _Type* __ptr, _Type* __dst, int __memorder, int __sco) +{} + +template ::__enable_if_not_native_ld_st = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_store_nvvm_dispatch(_Type* __ptr, _Type* __val, int __memorder, int __sco) +{} + +template ::__enable_if_not_native_cas = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE bool __atomic_compare_exchange_nvvm_dispatch( + _Type* __ptr, _Type* __exp, _Type* __des, bool __weak, int __success_memorder, int __failure_memorder, int __sco) +{} + +template ::__enable_if_not_native_cas = 0> +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_exchange_nvvm_dispatch(_Type* __atom, _Type* __val, _Type* __ret, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_max_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_min_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_and_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_or_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_xor_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + 
+template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_add_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{} + +template = 0> +static inline _CCCL_DEVICE void __atomic_load_nonnative(const _Type* __ptr, _Type& __dst, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint16_t) - 1); + uint16_t* __aligned = (uint16_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint16_t((intptr_t) __ptr & __alignmask) * 8; + + uint16_t __value = 0; + __cuda_atomic_load(__aligned, __value, _Order{}, __atomic_cuda_operand_b16{}, _Sco{}, __atomic_cuda_mmio_disable{}); + + __dst = static_cast<_Type>(__value >> __offset); +} + +template = 0> +static inline _CCCL_DEVICE bool +__atomic_cas_nonnative(_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint32_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + const uint32_t __cmpOffset = __cmp << __offset; + const uint32_t __opOffset = __op << __offset; + + // Algorithm for 8b CAS with 32b intrinsics + // __old = __window[0:32] where [__cmp] resides within some offset. 
+ uint32_t __old; + // Start by loading __old with the current value, this optimizes for early return when __cmp is wrong + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + // Reemit CAS instructions until we succeed or the old value is a mismatch + while (__cmpOffset == (__old & __valueMask)) + { + // Combine the desired value and most recently fetched expected masked portion of the window + const uint32_t __attempt = (__old & __windowMask) | __opOffset; + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return true; + } + } + __dst = static_cast<_Type>(__old >> __offset); + return false; +} + +// Optimized fetch_update CAS loop with op determined after first load reducing waste. +template = 0> +_CCCL_DEVICE _Type __atomic_fetch_update_nonnative(_Type* __ptr, const _Fn& __op, _Order, _Operand, _Sco) +{ + constexpr uint64_t __alignmask = (sizeof(uint32_t) - 1); + constexpr uint32_t __sizemask = (1 << (sizeof(_Type) * 8)) - 1; + uint32_t* __aligned = (uint32_t*) ((intptr_t) __ptr & (~__alignmask)); + const uint8_t __offset = uint8_t((intptr_t) __ptr & __alignmask) * 8; + const uint32_t __valueMask = __sizemask << __offset; + const uint32_t __windowMask = ~__valueMask; + + // 8/16b fetch update is similar to CAS implementation, but compresses the logic for recalculating the operand + // __old = __window[0:32] where [__cmp] resides within some offset. 
+ uint32_t __old; + NV_IF_TARGET( + NV_PROVIDES_SM_70, + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_relaxed{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});), + (__cuda_atomic_load( + __aligned, __old, __atomic_cuda_volatile{}, __atomic_cuda_operand_b32{}, _Sco{}, __atomic_cuda_mmio_disable{});)) + + // Reemit CAS instructions until we succeed + while (1) + { + // Calculate new desired value from last fetched __old + // Use of the value mask is required due to the possibility of overflow when ops are widened. Possible compiler bug? + const uint32_t __attempt = + ((static_cast(__op(static_cast<_Type>(__old >> __offset))) << __offset) & __valueMask) + | (__old & __windowMask); + + if (__cuda_atomic_compare_exchange( + __aligned, __old, __old, __attempt, _Order{}, __atomic_cuda_operand_b32{}, _Sco{})) + { + // CAS was successful + return static_cast<_Type>(__old >> __offset); + } + } +} + +#endif // ^_CCCL_CUDA_COMPILATION() + +_CCCL_END_NAMESPACE_CUDA_STD + +#endif // __CUDA_STD___ATOMIC_BACKENDS_DEVICE_FALLBACKS_H diff --git a/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_wrapped.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_wrapped.h new file mode 100644 index 00000000000..451afc7121a --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_nvvm_wrapped.h @@ -0,0 +1,268 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_WRAPPED_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_WRAPPED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +#if _CCCL_CUDA_COMPILATION() + +# define __ATOMIC_NVVM_WRAP(...) (__VA_ARGS__) +# define __ATOMIC_NVVM_UNWRAP1(...) __VA_ARGS__ +# define __ATOMIC_NVVM_UNWRAP(...) __ATOMIC_NVVM_UNWRAP1 __VA_ARGS__ + +# define __ATOMIC_SWITCH(val, ...) \ + switch (val) \ + { \ + __VA_ARGS__ \ + } + +# define __ATOMIC_CASE(test, fn, ...) \ + case test: \ + _CCCL_PP_OBSTRUCT(fn)(test, ##__VA_ARGS__) break; + +# define __ATOMIC_SCOPE_CASES_SM90(...) \ + /* THREAD */ __ATOMIC_CASE(__NV_THREAD_SCOPE_THREAD, __VA_ARGS__) \ + /* BLOCK */ __ATOMIC_CASE(__NV_THREAD_SCOPE_BLOCK, __VA_ARGS__) \ + /* DEVICE */ __ATOMIC_CASE(__NV_THREAD_SCOPE_DEVICE, __VA_ARGS__) \ + /* SYSTEM */ __ATOMIC_CASE(__NV_THREAD_SCOPE_SYSTEM, __VA_ARGS__) \ + /* CLUSTER */ __ATOMIC_CASE(__NV_THREAD_SCOPE_CLUSTER, __VA_ARGS__) + +# define __ATOMIC_SCOPE_CASES(...) \ + /* THREAD */ __ATOMIC_CASE(__NV_THREAD_SCOPE_THREAD, __VA_ARGS__) \ + /* BLOCK */ __ATOMIC_CASE(__NV_THREAD_SCOPE_BLOCK, __VA_ARGS__) \ + /* DEVICE */ __ATOMIC_CASE(__NV_THREAD_SCOPE_DEVICE, __VA_ARGS__) \ + /* SYSTEM */ __ATOMIC_CASE(__NV_THREAD_SCOPE_SYSTEM, __VA_ARGS__) + +# define __ATOMIC_ALL_ORDER_CASES(...) 
\ + /* RELAXED */ __ATOMIC_CASE(__NV_ATOMIC_RELAXED, __VA_ARGS__) \ + /* CONSUME */ __ATOMIC_CASE(__NV_ATOMIC_CONSUME, __VA_ARGS__) \ + /* ACQUIRE */ __ATOMIC_CASE(__NV_ATOMIC_ACQUIRE, __VA_ARGS__) \ + /* RELEASE */ __ATOMIC_CASE(__NV_ATOMIC_RELEASE, __VA_ARGS__) \ + /* ACQ_REL */ __ATOMIC_CASE(__NV_ATOMIC_ACQ_REL, __VA_ARGS__) \ + /* SEQ_CST */ __ATOMIC_CASE(__NV_ATOMIC_SEQ_CST, __VA_ARGS__) + +# define __ATOMIC_READ_CASES(...) \ + /* RELAXED */ __ATOMIC_CASE(__NV_ATOMIC_RELAXED, __VA_ARGS__) \ + /* CONSUME */ __ATOMIC_CASE(__NV_ATOMIC_CONSUME, __VA_ARGS__) \ + /* ACQUIRE */ __ATOMIC_CASE(__NV_ATOMIC_ACQUIRE, __VA_ARGS__) \ + /* SEQ_CST */ __ATOMIC_CASE(__NV_ATOMIC_SEQ_CST, __VA_ARGS__) + +# define __ATOMIC_WRITE_CASES(...) \ + /* RELAXED */ __ATOMIC_CASE(__NV_ATOMIC_RELAXED, __VA_ARGS__) \ + /* RELEASE */ __ATOMIC_CASE(__NV_ATOMIC_RELEASE, __VA_ARGS__) \ + /* SEQ_CST */ __ATOMIC_CASE(__NV_ATOMIC_SEQ_CST, __VA_ARGS__) + +# define __ATOMIC_FENCE_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) +# define __ATOMIC_EXCHANGE_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) +# define __ATOMIC_FETCH_OP_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) + +# define __ATOMIC_COMPARE_SUCCESS_CASES(...) __ATOMIC_ALL_ORDER_CASES(__VA_ARGS__) +# define __ATOMIC_COMPARE_FAILURE_CASES(...) __ATOMIC_READ_CASES(__VA_ARGS__) + +# define __ATOMIC_SCOPES_SWITCH(scope, scopes, ...) __ATOMIC_SWITCH(scope, scopes(__VA_ARGS__)) +# define __ATOMIC_ORDER_SWITCH(order, orders, ...) __ATOMIC_SWITCH(order, orders(__VA_ARGS__)) + +# define __ATOMIC_NVVM_BUILTIN2(_scope, intrinsic, ...) intrinsic(__ATOMIC_NVVM_UNWRAP(__VA_ARGS__), _scope); +# define __ATOMIC_NVVM_BUILTIN1(_order, intrinsic, scope, scopes, ...) \ + __ATOMIC_SCOPES_SWITCH( \ + scope, scopes, __ATOMIC_NVVM_BUILTIN2, intrinsic, __ATOMIC_NVVM_WRAP(__ATOMIC_NVVM_UNWRAP(__VA_ARGS__), _order)) +# define __ATOMIC_NVVM_BUILTIN0(_order, intrinsic, order, orders, scope, scopes, ...) 
\ + __ATOMIC_ORDER_SWITCH( \ + order, \ + orders, \ + __ATOMIC_NVVM_BUILTIN1, \ + intrinsic, \ + scope, \ + scopes, \ + __ATOMIC_NVVM_WRAP(__ATOMIC_NVVM_UNWRAP(__VA_ARGS__), _order)) + +// An attempted explanation: +// We pass down macro function names and arguments through functions that create switch statements, the cases expand +// them by eventually invoking the passed in `__ATOMIC_NVVM_BUILTIN#` with the now concrete case value selected in the +// switch - This then calls another switch builder, uses another macro function, and expands again. Arguments to the +// function are packed inside of `()` by __ATOMIC_NVVM_WRAP/UNWRAP in order to prevent any accidental escape. +# define __ATOMIC_NVVM_BUILTIN(intrinsic, order, orders, scope, ...) \ + NV_IF_ELSE_TARGET( \ + NV_PROVIDES_SM_90, \ + ({__ATOMIC_ORDER_SWITCH( \ + order, \ + orders, \ + __ATOMIC_NVVM_BUILTIN1, \ + intrinsic, \ + scope, \ + __ATOMIC_SCOPE_CASES_SM90, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))}), \ + ({__ATOMIC_ORDER_SWITCH( \ + order, \ + orders, \ + __ATOMIC_NVVM_BUILTIN1, \ + intrinsic, \ + scope, \ + __ATOMIC_SCOPE_CASES, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))})) + +// __ATOMIC_NVVM_BUILTIN_SF selects three times for compare_exchange +# define __ATOMIC_NVVM_BUILTIN_SF(intrinsic, success, sorders, failure, forders, scope, ...) 
\ + NV_IF_ELSE_TARGET( \ + NV_PROVIDES_SM_90, \ + ({__ATOMIC_ORDER_SWITCH( \ + success, \ + sorders, \ + __ATOMIC_NVVM_BUILTIN0, \ + intrinsic, \ + failure, \ + forders, \ + scope, \ + __ATOMIC_SCOPE_CASES_SM90, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))}), \ + ({__ATOMIC_ORDER_SWITCH( \ + success, \ + sorders, \ + __ATOMIC_NVVM_BUILTIN0, \ + intrinsic, \ + failure, \ + forders, \ + scope, \ + __ATOMIC_SCOPE_CASES, \ + __ATOMIC_NVVM_WRAP(__VA_ARGS__))})) + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_thread_fence_nvvm_dispatch(const _Type* __ptr, _Type* __dst, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN(__nv_atomic_thread_fence, __memorder, __ATOMIC_FENCE_CASES, __sco)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_load_nvvm_dispatch(const _Type* __ptr, _Type* __dst, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN(__nv_atomic_load, __memorder, __ATOMIC_READ_CASES, __sco, __ptr, __dst)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_store_nvvm_dispatch(_Type* __ptr, _Type* __val, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN(__nv_atomic_store, __memorder, __ATOMIC_WRITE_CASES, __sco, __ptr, __val)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE bool __atomic_compare_exchange_nvvm_dispatch( + _Type* __ptr, _Type* __exp, _Type* __des, bool __weak, int __success_memorder, int __failure_memorder, int __sco) +{ + _CCCL_PP_EXPAND(__ATOMIC_NVVM_BUILTIN_SF( + return __nv_atomic_compare_exchange, + __success_memorder, + __ATOMIC_COMPARE_SUCCESS_CASES, + __failure_memorder, + __ATOMIC_COMPARE_FAILURE_CASES, + __sco, + __ptr, + __exp, + __des, + __weak)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE void +__atomic_exchange_nvvm_dispatch(_Type* __atom, _Type* __val, _Type* __ret, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + 
__ATOMIC_NVVM_BUILTIN(__nv_atomic_exchange, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __atom, __val, __ret)); +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_max_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_max, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_min_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_min, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_and_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_and, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_or_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_or, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_xor_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_xor, __memorder, __ATOMIC_EXCHANGE_CASES, __sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +template +_CCCL_ARTIFICIAL static inline _CCCL_DEVICE _Type +__atomic_fetch_add_nvvm_dispatch(_Type* __ptr, _Type __op, int __memorder, int __sco) +{ + _CCCL_PP_EXPAND( + __ATOMIC_NVVM_BUILTIN(return __nv_atomic_fetch_add, __memorder, __ATOMIC_EXCHANGE_CASES, 
__sco, __ptr, __op)); + _CCCL_UNREACHABLE(); + return {}; +} + +#endif // _CCCL_CUDA_COMPILATION + +_CCCL_END_NAMESPACE_CUDA_STD + +#include + +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_NVVM_WRAPPED_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_derived.h similarity index 98% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_derived.h index 7ebca48711f..56e17c520d6 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_derived.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDA_STD___ATOMIC_FUNCTIONS_DERIVED_H -#define __CUDA_STD___ATOMIC_FUNCTIONS_DERIVED_H +#ifndef __CUDA_STD___ATOMIC_BACKENDS_DERIVED_H +#define __CUDA_STD___ATOMIC_BACKENDS_DERIVED_H #include @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -448,4 +448,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // __CUDA_STD___ATOMIC_FUNCTIONS_DERIVED_H +#endif // __CUDA_STD___ATOMIC_BACKENDS_DERIVED_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_generated.h similarity index 99% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_generated.h index f3e30d53039..4f701866828 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_ptx_generated.h @@ -11,8 +11,8 @@ // This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H 
-#define _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H #include @@ -33,9 +33,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include @@ -1371,7 +1371,7 @@ static inline _CCCL_DEVICE void __atomic_store_cuda(_Type* __ptr, _Type& __val, __cuda_atomic_store_memory_order_dispatch(__bound_store, __memorder, _Sco{}); } template -static inline _CCCL_DEVICE void __atomic_store_cuda(volatile _Type* __ptr, _Type& __val, int __memorder, _Sco) +static inline _CCCL_DEVICE void __atomic_store_cuda(_Type volatile* __ptr, _Type& __val, int __memorder, _Sco) { using __proxy_t = typename __atomic_cuda_deduce_bitwise<_Type>::__type; using __proxy_tag = typename __atomic_cuda_deduce_bitwise<_Type>::__tag; @@ -4431,6 +4431,6 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_PTX_GENERATED_H // clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h b/libcudacxx/include/cuda/std/__atomic/backends/cuda_supported_atomics_helper.h similarity index 97% rename from libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h rename to libcudacxx/include/cuda/std/__atomic/backends/cuda_supported_atomics_helper.h index cd221248b99..e462f09512e 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/cuda_supported_atomics_helper.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H -#define _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_HELPER_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_CUDA_SUPPORTED_ATOMICS_HELPER_H +#define _CUDA_STD___ATOMIC_BACKENDS_CUDA_SUPPORTED_ATOMICS_HELPER_H #include @@ 
-181,4 +181,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_CUDA_SUPPORTED_ATOMICS_HELPER_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/backends/host.h similarity index 97% rename from libcudacxx/include/cuda/std/__atomic/functions/host.h rename to libcudacxx/include/cuda/std/__atomic/backends/host.h index 0a20a333ff8..48350d6ce57 100644 --- a/libcudacxx/include/cuda/std/__atomic/functions/host.h +++ b/libcudacxx/include/cuda/std/__atomic/backends/host.h @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_STD___ATOMICS_FUNCTIONS_HOST_H -#define _CUDA_STD___ATOMICS_FUNCTIONS_HOST_H +#ifndef _CUDA_STD___ATOMIC_BACKENDS_HOST_H +#define _CUDA_STD___ATOMIC_BACKENDS_HOST_H #include @@ -21,7 +21,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -239,4 +239,4 @@ _CCCL_END_NAMESPACE_CUDA_STD #include -#endif // _CUDA_STD___ATOMICS_FUNCTIONS_HOST_H +#endif // _CUDA_STD___ATOMIC_BACKENDS_HOST_H diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h index 6efe67f10a6..9aa8e6060c9 100644 --- a/libcudacxx/include/cuda/std/__atomic/order.h +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -109,30 +109,43 @@ _CCCL_API inline int __stronger_order_cuda(int __a, int __b) _CCCL_API constexpr int __atomic_order_to_int(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELEASE - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? 
__ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); + switch (__order) + { + default: + return __ATOMIC_CONSUME; + case memory_order_relaxed: + return __ATOMIC_RELAXED; + case memory_order_acquire: + return __ATOMIC_ACQUIRE; + case memory_order_release: + return __ATOMIC_RELEASE; + case memory_order_seq_cst: + return __ATOMIC_SEQ_CST; + case memory_order_acq_rel: + return __ATOMIC_ACQ_REL; + } } _CCCL_API constexpr int __atomic_failure_order_to_int(memory_order __order) { - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELAXED - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME)))); + // Note: + // release -> relaxed + // acq_rel -> acquire + switch (__order) + { + default: + return __ATOMIC_CONSUME; + case memory_order_relaxed: + return __ATOMIC_RELAXED; + case memory_order_acquire: + return __ATOMIC_ACQUIRE; + case memory_order_release: + return __ATOMIC_RELAXED; + case memory_order_seq_cst: + return __ATOMIC_SEQ_CST; + case memory_order_acq_rel: + return __ATOMIC_ACQUIRE; + } } static_assert((is_same_v::type, __memory_order_underlying_t>), diff --git a/libcudacxx/include/cuda/std/__atomic/types/base.h b/libcudacxx/include/cuda/std/__atomic/types/base.h index aacd8453c8a..cd8d2cb4983 100644 --- a/libcudacxx/include/cuda/std/__atomic/types/base.h +++ b/libcudacxx/include/cuda/std/__atomic/types/base.h @@ -21,7 +21,8 @@ # pragma system_header #endif // no system header -#include +#include +#include #include #include @@ -69,7 +70,7 @@ _CCCL_API inline void __atomic_thread_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());), + 
(__atomic_thread_fence_cuda(::cuda::std::__atomic_order_to_int(__order), __thread_scope_system_tag());), NV_IS_HOST, (__atomic_thread_fence_host(__order);)) } @@ -77,7 +78,7 @@ _CCCL_API inline void __atomic_thread_fence_dispatch(memory_order __order) _CCCL_API inline void __atomic_signal_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET(NV_IS_DEVICE, - (__atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order));), + (__atomic_signal_fence_cuda(::cuda::std::__atomic_order_to_int(__order));), NV_IS_HOST, (__atomic_signal_fence_host(__order);)) } @@ -91,21 +92,20 @@ _CCCL_API void __atomic_init_dispatch(_Sto* __a, _Up __val) template = 0> _CCCL_API void __atomic_store_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), - NV_IS_HOST, - (__atomic_store_host(__a->get(), __val, __order);)) + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (__atomic_store_n_cuda(__a->get(), __val, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), + NV_IS_HOST, + (__atomic_store_host(__a->get(), __val, __order);)) } template = 0> _CCCL_API auto __atomic_load_dispatch(const _Sto* __a, memory_order __order, _Sco = {}) -> __atomic_underlying_t<_Sto> { - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__a->get(), static_cast<__memory_order_underlying_t>(__order), _Sco{});), - NV_IS_HOST, - (return __atomic_load_host(__a->get(), __order);)) + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (return __atomic_load_n_cuda(__a->get(), ::cuda::std::__atomic_order_to_int(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_load_host(__a->get(), __order);)) + _CCCL_UNREACHABLE(); } template = 0> @@ -114,7 +114,7 @@ _CCCL_API auto __atomic_exchange_dispatch(_Sto* __a, _Up __value, memory_order _ { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__a->get(), __value, 
static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_exchange_n_cuda(__a->get(), __value, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_exchange_host(__a->get(), __value, __order);)) } @@ -131,8 +131,8 @@ _CCCL_API bool __atomic_compare_exchange_strong_dispatch( __expected, __val, false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), + ::cuda::std::__atomic_order_to_int(__success), + ::cuda::std::__atomic_order_to_int(__failure), _Sco{});), NV_IS_HOST, (__result = __atomic_compare_exchange_strong_host(__a->get(), __expected, __val, __success, __failure);)) @@ -151,8 +151,8 @@ _CCCL_API bool __atomic_compare_exchange_weak_dispatch( __expected, __val, true, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), + ::cuda::std::__atomic_order_to_int(__success), + ::cuda::std::__atomic_order_to_int(__failure), _Sco{});), NV_IS_HOST, (__result = __atomic_compare_exchange_weak_host(__a->get(), __expected, __val, __success, __failure);)) @@ -165,7 +165,7 @@ _CCCL_API auto __atomic_fetch_add_dispatch(_Sto* __a, _Up __delta, memory_order { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_add_cuda(__a->get(), __delta, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_add_host(__a->get(), __delta, __order);)) } @@ -176,7 +176,7 @@ _CCCL_API auto __atomic_fetch_sub_dispatch(_Sto* __a, _Up __delta, memory_order { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_sub_cuda(__a->get(), __delta, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_sub_host(__a->get(), 
__delta, __order);)) } @@ -187,7 +187,7 @@ _CCCL_API auto __atomic_fetch_and_dispatch(_Sto* __a, _Up __pattern, memory_orde { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_and_cuda(__a->get(), __pattern, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_and_host(__a->get(), __pattern, __order);)) } @@ -198,7 +198,7 @@ _CCCL_API auto __atomic_fetch_or_dispatch(_Sto* __a, _Up __pattern, memory_order { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_or_cuda(__a->get(), __pattern, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_or_host(__a->get(), __pattern, __order);)) } @@ -209,7 +209,7 @@ _CCCL_API auto __atomic_fetch_xor_dispatch(_Sto* __a, _Up __pattern, memory_orde { NV_DISPATCH_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_xor_cuda(__a->get(), __pattern, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), NV_IS_HOST, (return __atomic_fetch_xor_host(__a->get(), __pattern, __order);)) } @@ -220,7 +220,7 @@ _CCCL_API auto __atomic_fetch_max_dispatch(_Sto* __a, _Up __val, memory_order __ { NV_IF_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_max_cuda(__a->get(), __val, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), (return __atomic_fetch_max_host(__a->get(), __val, __order);)) } @@ -230,7 +230,7 @@ _CCCL_API auto __atomic_fetch_min_dispatch(_Sto* __a, _Up __val, memory_order __ { NV_IF_TARGET( NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__a->get(), __val, 
static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_min_cuda(__a->get(), __val, ::cuda::std::__atomic_order_to_int(__order), _Sco{});), (return __atomic_fetch_min_host(__a->get(), __val, __order);)) } diff --git a/libcudacxx/include/cuda/std/__cccl/preprocessor.h b/libcudacxx/include/cuda/std/__cccl/preprocessor.h index 8f40726c10c..61610a2941f 100644 --- a/libcudacxx/include/cuda/std/__cccl/preprocessor.h +++ b/libcudacxx/include/cuda/std/__cccl/preprocessor.h @@ -38,9 +38,13 @@ CCCL_IGNORE_MSVC_TRADITIONAL_PREPROCESSOR_WARNING to suppress this warning. #define _CCCL_PP_SECOND(_, second, ...) second #define _CCCL_PP_THIRD(_1, _2, third) third -#define _CCCL_PP_EXPAND(...) __VA_ARGS__ +#define _CCCL_PP_EXPAND1(...) __VA_ARGS__ +#define _CCCL_PP_EXPAND(...) _CCCL_PP_EXPAND1(_CCCL_PP_EXPAND1(__VA_ARGS__)) #define _CCCL_PP_EAT(...) +#define _CCCL_PP_DEFER(id) id _CCCL_PP_EAT() +#define _CCCL_PP_OBSTRUCT(...) __VA_ARGS__ _CCCL_PP_DEFER(_CCCL_PP_EAT)() + #define _CCCL_PP_CAT_(_Xp, ...) _Xp##__VA_ARGS__ #define _CCCL_PP_CAT(_Xp, ...) _CCCL_PP_CAT_(_Xp, __VA_ARGS__) diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index b6619996d43..221c699dc5d 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -559,7 +559,6 @@ atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept } // flag type and operations - struct atomic_flag { __atomic_storage_t<_CCCL_ATOMIC_FLAG_TYPE> __a; @@ -730,7 +729,6 @@ _CCCL_API inline void atomic_flag_notify_all(atomic_flag* __o) noexcept } #endif - // fences _CCCL_API inline void atomic_thread_fence(memory_order __m) noexcept