Merge branch 'main' into stf_deprecate_scheduler

caugonnet · web-flow · commit 52591c28ddb7 · 2026-03-27T00:15:42.000+01:00
diff --git a/cudax/examples/stf/linear_algebra/cg_dense_2D.cu b/cudax/examples/stf/linear_algebra/cg_dense_2D.cu
@@ -29,7 +29,8 @@ public:
   {
     h_addr.reset(new double[N * N]);
     cuda_safe_call(cudaHostRegister(h_addr.get(), N * N * sizeof(double), cudaHostRegisterPortable));
-    handle = to_shared(ctx.logical_data(make_slice(h_addr.get(), std::tuple{N, N}, N)));
+    handle = ::std::make_shared<logical_data<slice<double, 2>>>(
+      ctx.logical_data(make_slice(h_addr.get(), std::tuple{N, N}, N)));
   }
 
   void fill(const std::function<double(int, int)>& f)
@@ -68,7 +69,7 @@ public:
       for (size_t b = 0; b < nblocks; b++)
       {
         size_t bs  = std::min(N - block_size * b, block_size);
-        handles[b] = to_shared(ctx.logical_data(shape_of<slice<double>>(bs)));
+        handles[b] = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(bs)));
       }
     }
     else
@@ -77,8 +78,9 @@ public:
       cuda_safe_call(cudaHostRegister(h_addr.get(), N * sizeof(double), cudaHostRegisterPortable));
       for (size_t b = 0; b < nblocks; b++)
       {
-        size_t bs  = std::min(N - block_size * b, block_size);
-        handles[b] = to_shared(ctx.logical_data(make_slice(&h_addr[block_size * b], bs)));
+        size_t bs = std::min(N - block_size * b, block_size);
+        handles[b] =
+          ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(make_slice(&h_addr[block_size * b], bs)));
       }
     }
   }
@@ -94,7 +96,7 @@ public:
     for (size_t b = 0; b < nblocks; b++)
     {
       size_t bs  = std::min(N - block_size * b, block_size);
-      handles[b] = to_shared(ctx.logical_data(shape_of<slice<double>>(bs)));
+      handles[b] = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(bs)));
 
       ctx.task(handles[b]->write(), a.handles[b]->read())->*[bs](cudaStream_t stream, auto dthis, auto da) {
         // There are likely much more efficient ways.
@@ -154,13 +156,13 @@ public:
     if (is_tmp)
     {
       // There is no physical backing for this temporary vector
-      handle = to_shared(ctx.logical_data(shape_of<slice<double>>(1)));
+      handle = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(1)));
     }
     else
     {
       h_addr.reset(new double);
       cuda_safe_call(cudaHostRegister(h_addr.get(), s, cudaHostRegisterPortable));
-      handle = to_shared(ctx.logical_data(make_slice(h_addr.get(), 1)));
+      handle = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(make_slice(h_addr.get(), 1)));
     }
   }
 
@@ -170,7 +172,7 @@ public:
   // Copy constructor
   scalar(const scalar& a)
   {
-    handle = to_shared(ctx.logical_data(shape_of<slice<double>>(1)));
+    handle = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(1)));
 
     ctx.task(handle->write(), a.handle->read())->*[](cudaStream_t stream, auto dthis, auto da) {
       // There are likely much more efficient ways.
diff --git a/cudax/include/cuda/experimental/__stf/internal/cooperative_group_system.cuh b/cudax/include/cuda/experimental/__stf/internal/cooperative_group_system.cuh
@@ -20,7 +20,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/experimental/__stf/utility/cuda_attributes.cuh>
 #if _CCCL_CUDA_COMPILATION()
 #  include <cooperative_groups.h>
 #endif // _CCCL_CUDA_COMPILATION()
diff --git a/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh b/cudax/include/cuda/experimental/__stf/internal/execution_policy.cuh
@@ -24,7 +24,6 @@
 #endif // no system header
 
 #include <cuda/experimental/__stf/utility/core.cuh>
-#include <cuda/experimental/__stf/utility/cuda_attributes.cuh>
 #include <cuda/experimental/__stf/utility/cuda_safe_call.cuh>
 
 #include <cassert>
diff --git a/cudax/include/cuda/experimental/__stf/internal/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/internal/hashtable_linearprobing.cuh
@@ -27,7 +27,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/experimental/__stf/utility/cuda_attributes.cuh>
 #include <cuda/experimental/__stf/utility/cuda_safe_call.cuh>
 #include <cuda/experimental/__stf/utility/hash.cuh>
 
diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -61,9 +61,9 @@ __global__ void loop(const _CCCL_GRID_CONSTANT size_t n, shape_t shape, F f, tup
 
   // This will explode the targs tuple into a pack of data
   // Help the compiler which may not detect that a device lambda is calling a device lambda
-  CUDASTF_NO_DEVICE_STACK
+  _CCCL_DIAG_SUPPRESS_NVHPC(no_device_stack)
   auto const explode_args = [&](auto&... data) {
-    CUDASTF_NO_DEVICE_STACK
+    _CCCL_DIAG_SUPPRESS_NVHPC(no_device_stack)
     auto const explode_coords = [&](auto&&... coords) {
       // No move/forward for `data` because it's used multiple times.
       f(::std::forward<decltype(coords)>(coords)..., data...);
@@ -303,9 +303,9 @@ __global__ void loop_redux(
   // This is used to build the arguments passed to the user-provided lambda function.
 
   // Help the compiler which may not detect that a device lambda is calling a device lambda
-  CUDASTF_NO_DEVICE_STACK
+  _CCCL_DIAG_SUPPRESS_NVHPC(no_device_stack)
   const auto explode_args = [&](auto&&... data) {
-    CUDASTF_NO_DEVICE_STACK
+    _CCCL_DIAG_SUPPRESS_NVHPC(no_device_stack)
     const auto explode_coords = [&](auto&&... coords) {
       // No move/forward for `data` because it's used multiple times.
       f(::std::forward<decltype(coords)>(coords)..., data...);
diff --git a/cudax/include/cuda/experimental/__stf/utility/cartesian_iterator.cuh b/cudax/include/cuda/experimental/__stf/utility/cartesian_iterator.cuh
@@ -20,7 +20,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/experimental/__stf/utility/cuda_attributes.cuh>
 #include <cuda/experimental/__stf/utility/unittest.cuh>
 
 namespace cuda::experimental::stf::reserved
diff --git a/cudax/include/cuda/experimental/__stf/utility/core.cuh b/cudax/include/cuda/experimental/__stf/utility/core.cuh
@@ -24,13 +24,9 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/experimental/__stf/utility/cuda_attributes.cuh>
-
 #include <cstddef>
 #include <functional>
 #include <limits>
-#include <memory>
-#include <string>
 #include <tuple>
 #include <type_traits>
 #include <utility>
@@ -98,38 +94,6 @@ _CCCL_HOST_DEVICE constexpr decltype(auto) mv(T&& obj)
 }
 #endif // _CCCL_DOXYGEN_INVOKED
 
-/**
- * @brief Creates a `std::shared_ptr` managing a copy of the given object.
- *
- * This function takes an object of any type and returns a `std::shared_ptr`
- * that manages a copy of that object. If the object is an lvalue reference,
- * it will be copied into the `shared_ptr`. If the object is an rvalue reference,
- * it will be moved into the `shared_ptr`.
- *
- * The type managed by the `shared_ptr` has all references and `const`/`volatile`
- * qualifiers removed from the original type.
- *
- * @tparam T The type of the object, deduced automatically. May be an lvalue or rvalue reference.
- * @param obj The object to copy into the instance managed by the `shared_ptr`.
- * @return A `std::shared_ptr` managing a new copy of the object.
- *
- * @note This function simplifies the creation of `std::shared_ptr`s by handling
- * the type deduction and appropriate forwarding of the object. It's particularly
- * useful when you want to create a `shared_ptr` from temporary objects or when
- * the object's type includes references or cv-qualifiers.
- *
- * @code
- * int value = 42;
- * auto sp1 = to_shared(value);            // New shared_ptr<int>
- * assert(*sp1 == 42);                     // sp1 points to an int valued at 42
- * @endcode
- */
-template <typename T>
-auto to_shared(T&& obj)
-{
-  return ::std::make_shared<::std::remove_cv_t<::std::remove_reference_t<T>>>(::std::forward<T>(obj));
-}
-
 /**
  * @brief   Create an iterable range from 'from' to 'to'
  *
diff --git a/cudax/include/cuda/experimental/__stf/utility/cuda_attributes.cuh b/cudax/include/cuda/experimental/__stf/utility/cuda_attributes.cuh
diff --git a/cudax/include/cuda/experimental/__stf/utility/dimensions.cuh b/cudax/include/cuda/experimental/__stf/utility/dimensions.cuh
@@ -25,7 +25,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/experimental/__stf/utility/cuda_attributes.cuh>
 #include <cuda/experimental/__stf/utility/hash.cuh>
 #include <cuda/experimental/__stf/utility/unittest.cuh>
 
@@ -491,7 +490,7 @@ public:
   _CCCL_HOST_DEVICE coords_t index_to_coords(size_t index) const
   {
     // Help the compiler which may not detect that a device lambda is calling a device lambda
-    CUDASTF_NO_DEVICE_STACK
+    _CCCL_DIAG_SUPPRESS_NVHPC(no_device_stack)
     return make_tuple_indexwise<dimensions>([&](auto i) {
       // included
       const ::std::ptrdiff_t begin_i  = get_begin(i);
@@ -500,7 +499,7 @@ public:
       index /= extent_i;
       return result;
     });
-    CUDASTF_NO_DEVICE_STACK
+    _CCCL_DIAG_SUPPRESS_NVHPC(no_device_stack)
   }
 
 private:

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,8 @@ public:`
`29`	`29`	`{`
`30`	`30`	`h_addr.reset(new double[N * N]);`
`31`	`31`	`cuda_safe_call(cudaHostRegister(h_addr.get(), N * N * sizeof(double), cudaHostRegisterPortable));`
`32`		`- handle = to_shared(ctx.logical_data(make_slice(h_addr.get(), std::tuple{N, N}, N)));`
	`32`	`+ handle = ::std::make_shared<logical_data<slice<double, 2>>>(`
	`33`	`+ ctx.logical_data(make_slice(h_addr.get(), std::tuple{N, N}, N)));`
`33`	`34`	`}`
`34`	`35`
`35`	`36`	`void fill(const std::function<double(int, int)>& f)`
`@@ -68,7 +69,7 @@ public:`
`68`	`69`	`for (size_t b = 0; b < nblocks; b++)`
`69`	`70`	`{`
`70`	`71`	`size_t bs = std::min(N - block_size * b, block_size);`
`71`		`- handles[b] = to_shared(ctx.logical_data(shape_of<slice<double>>(bs)));`
	`72`	`+ handles[b] = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(bs)));`
`72`	`73`	`}`
`73`	`74`	`}`
`74`	`75`	`else`
`@@ -77,8 +78,9 @@ public:`
`77`	`78`	`cuda_safe_call(cudaHostRegister(h_addr.get(), N * sizeof(double), cudaHostRegisterPortable));`
`78`	`79`	`for (size_t b = 0; b < nblocks; b++)`
`79`	`80`	`{`
`80`		`- size_t bs = std::min(N - block_size * b, block_size);`
`81`		`- handles[b] = to_shared(ctx.logical_data(make_slice(&h_addr[block_size * b], bs)));`
	`81`	`+ size_t bs = std::min(N - block_size * b, block_size);`
	`82`	`+ handles[b] =`
	`83`	`+ ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(make_slice(&h_addr[block_size * b], bs)));`
`82`	`84`	`}`
`83`	`85`	`}`
`84`	`86`	`}`
`@@ -94,7 +96,7 @@ public:`
`94`	`96`	`for (size_t b = 0; b < nblocks; b++)`
`95`	`97`	`{`
`96`	`98`	`size_t bs = std::min(N - block_size * b, block_size);`
`97`		`- handles[b] = to_shared(ctx.logical_data(shape_of<slice<double>>(bs)));`
	`99`	`+ handles[b] = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(bs)));`
`98`	`100`
`99`	`101`	`ctx.task(handles[b]->write(), a.handles[b]->read())->*[bs](cudaStream_t stream, auto dthis, auto da) {`
`100`	`102`	`// There are likely much more efficient ways.`
`@@ -154,13 +156,13 @@ public:`
`154`	`156`	`if (is_tmp)`
`155`	`157`	`{`
`156`	`158`	`// There is no physical backing for this temporary vector`
`157`		`- handle = to_shared(ctx.logical_data(shape_of<slice<double>>(1)));`
	`159`	`+ handle = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(1)));`
`158`	`160`	`}`
`159`	`161`	`else`
`160`	`162`	`{`
`161`	`163`	`h_addr.reset(new double);`
`162`	`164`	`cuda_safe_call(cudaHostRegister(h_addr.get(), s, cudaHostRegisterPortable));`
`163`		`- handle = to_shared(ctx.logical_data(make_slice(h_addr.get(), 1)));`
	`165`	`+ handle = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(make_slice(h_addr.get(), 1)));`
`164`	`166`	`}`
`165`	`167`	`}`
`166`	`168`
`@@ -170,7 +172,7 @@ public:`
`170`	`172`	`// Copy constructor`
`171`	`173`	`scalar(const scalar& a)`
`172`	`174`	`{`
`173`		`- handle = to_shared(ctx.logical_data(shape_of<slice<double>>(1)));`
	`175`	`+ handle = ::std::make_shared<logical_data<slice<double>>>(ctx.logical_data(shape_of<slice<double>>(1)));`
`174`	`176`
`175`	`177`	`ctx.task(handle->write(), a.handle->read())->*[](cudaStream_t stream, auto dthis, auto da) {`
`176`	`178`	`// There are likely much more efficient ways.`