alpaka-group · AuroraPerego · May 27, 2025 · Apr 3, 2025 · Apr 6, 2025 · Apr 7, 2025
diff --git a/include/alpaka/dev/DevCpu.hpp b/include/alpaka/dev/DevCpu.hpp
@@ -6,7 +6,7 @@
 #pragma once
 
 #include "alpaka/dev/Traits.hpp"
-#include "alpaka/dev/common/QueueRegistry.hpp"
+#include "alpaka/dev/common/DevGenericImpl.hpp"
 #include "alpaka/dev/cpu/SysInfo.hpp"
 #include "alpaka/mem/buf/Traits.hpp"
 #include "alpaka/platform/Traits.hpp"
@@ -46,7 +46,7 @@ namespace alpaka
     namespace cpu::detail
     {
         //! The CPU device implementation.
-        using DevCpuImpl = alpaka::detail::QueueRegistry<cpu::ICpuQueue>;
+        using DevCpuImpl = alpaka::detail::DevGenericImpl<cpu::ICpuQueue>;
     } // namespace cpu::detail
 
     //! The CPU device handle.
@@ -89,29 +89,42 @@ namespace alpaka
             return 0;
         }
 
+        static void setDeviceProperties(alpaka::DevCpu const&, alpaka::DeviceProperties& devProperties)
+        {
+            devProperties.name = cpu::detail::getCpuName();
+            devProperties.totalGlobalMem = cpu::detail::getTotalGlobalMemSizeBytes();
+        }
+
+        friend struct trait::GetName<DevCpu>;
+        friend struct trait::GetMemBytes<DevCpu>;
+        friend struct trait::GetFreeMemBytes<DevCpu>;
+        friend struct trait::GetWarpSizes<DevCpu>;
+        friend struct trait::GetPreferredWarpSize<DevCpu>;
+
     private:
         std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
     };
 
     namespace trait
     {
+
         //! The CPU device name get trait specialization.
         template<>
         struct GetName<DevCpu>
         {
-            ALPAKA_FN_HOST static auto getName(DevCpu const& /* dev */) -> std::string
+            ALPAKA_FN_HOST static auto getName(DevCpu const& dev) -> std::string
             {
-                return cpu::detail::getCpuName();
+                return dev.m_spDevCpuImpl->deviceProperties(dev)->name;
             }
         };
 
         //! The CPU device available memory get trait specialization.
         template<>
         struct GetMemBytes<DevCpu>
         {
-            ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& /* dev */) -> std::size_t
+            ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& dev) -> std::size_t
             {
-                return cpu::detail::getTotalGlobalMemSizeBytes();
+                return dev.m_spDevCpuImpl->deviceProperties(dev)->totalGlobalMem;
             }
         };
 

diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
@@ -9,6 +9,7 @@
 #include "alpaka/core/Common.hpp"
 #include "alpaka/core/Sycl.hpp"
 #include "alpaka/dev/Traits.hpp"
+#include "alpaka/dev/common/DeviceProperties.hpp"
 #include "alpaka/mem/buf/Traits.hpp"
 #include "alpaka/platform/Traits.hpp"
 #include "alpaka/queue/Properties.hpp"
@@ -112,11 +113,41 @@ namespace alpaka
                 return m_context;
             }
 
+            auto deviceProperties() -> std::optional<alpaka::DeviceProperties>&
+            {
+                std::call_once(
+                    m_onceFlag,
+                    [&]()
+                    {
+                        m_deviceProperties = std::make_optional<alpaka::DeviceProperties>();
+                        auto const& device = this->get_device();
+                        m_deviceProperties->name = device.template get_info<sycl::info::device::name>();
+                        m_deviceProperties->totalGlobalMem
+                            = device.template get_info<sycl::info::device::global_mem_size>();
+
+                        std::vector<std::size_t> warp_sizes
+                            = device.template get_info<sycl::info::device::sub_group_sizes>();
+                        // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently
+                        // does not
+                        auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
+                        if(find64 != warp_sizes.end())
+                            warp_sizes.erase(find64);
+                        // Sort the warp sizes in decreasing order
+                        std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
+                        m_deviceProperties->warpSizes = std::move(warp_sizes);
+                        m_deviceProperties->preferredWarpSize = m_deviceProperties->warpSizes.front();
+                    });
+
+                return m_deviceProperties;
+            }
+
         private:
             sycl::device m_device;
             sycl::context m_context;
             std::vector<std::weak_ptr<QueueGenericSyclImpl>> m_queues;
+            std::optional<alpaka::DeviceProperties> m_deviceProperties;
             std::shared_mutex mutable m_mutex;
+            std::once_flag m_onceFlag;
         };
     } // namespace detail
 
@@ -154,14 +185,14 @@ namespace alpaka
 
     namespace trait
     {
+
         //! The SYCL device name get trait specialization.
         template<concepts::Tag TTag>
         struct GetName<DevGenericSycl<TTag>>
         {
             static auto getName(DevGenericSycl<TTag> const& dev) -> std::string
             {
-                auto const device = dev.getNativeHandle().first;
-                return device.template get_info<sycl::info::device::name>();
+                return dev.m_impl->deviceProperties()->name;
             }
         };
 
@@ -171,8 +202,7 @@ namespace alpaka
         {
             static auto getMemBytes(DevGenericSycl<TTag> const& dev) -> std::size_t
             {
-                auto const device = dev.getNativeHandle().first;
-                return device.template get_info<sycl::info::device::global_mem_size>();
+                return dev.m_impl->deviceProperties()->totalGlobalMem;
             }
         };
 
@@ -195,15 +225,7 @@ namespace alpaka
         {
             static auto getWarpSizes(DevGenericSycl<TTag> const& dev) -> std::vector<std::size_t>
             {
-                auto const device = dev.getNativeHandle().first;
-                std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
-                // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
-                auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
-                if(find64 != warp_sizes.end())
-                    warp_sizes.erase(find64);
-                // Sort the warp sizes in decreasing order
-                std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
-                return warp_sizes;
+                return dev.m_impl->deviceProperties()->warpSizes;
             }
         };
 
@@ -213,7 +235,7 @@ namespace alpaka
         {
             static auto getPreferredWarpSize(DevGenericSycl<TTag> const& dev) -> std::size_t
             {
-                return GetWarpSizes<DevGenericSycl<TTag>>::getWarpSizes(dev).front();
+                return dev.m_impl->deviceProperties()->preferredWarpSize;
             }
         };
 

diff --git a/include/alpaka/dev/DevUniformCudaHipRt.hpp b/include/alpaka/dev/DevUniformCudaHipRt.hpp
@@ -10,7 +10,8 @@
 #include "alpaka/core/Hip.hpp"
 #include "alpaka/core/Interface.hpp"
 #include "alpaka/dev/Traits.hpp"
-#include "alpaka/dev/common/QueueRegistry.hpp"
+#include "alpaka/dev/common/DevGenericImpl.hpp"
+#include "alpaka/dev/common/DeviceProperties.hpp"
 #include "alpaka/mem/buf/Traits.hpp"
 #include "alpaka/platform/Traits.hpp"
 #include "alpaka/queue/Properties.hpp"
@@ -20,13 +21,18 @@
 #include "alpaka/wait/Traits.hpp"
 
 #include <cstddef>
+#include <mutex>
 #include <string>
 #include <vector>
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
 
 namespace alpaka
 {
+
+    template<typename TApi>
+    class DevUniformCudaHipRt;
+
     namespace trait
     {
         template<typename TPlatform, typename TSfinae>
@@ -62,7 +68,7 @@ namespace alpaka
         using IDeviceQueue = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;
 
     protected:
-        DevUniformCudaHipRt() : m_QueueRegistry{std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>()}
+        DevUniformCudaHipRt() : m_DevGenericImpl{std::make_shared<alpaka::detail::DevGenericImpl<IDeviceQueue>>()}
         {
         }
 
@@ -84,42 +90,68 @@ namespace alpaka
 
         [[nodiscard]] ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<IDeviceQueue>>
         {
-            return m_QueueRegistry->getAllExistingQueues();
+            return m_DevGenericImpl->getAllExistingQueues();
         }
 
         //! Registers the given queue on this device.
         //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
         ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IDeviceQueue> spQueue) const -> void
         {
-            m_QueueRegistry->registerQueue(spQueue);
+            m_DevGenericImpl->registerQueue(spQueue);
+        }
+
+        static void setDeviceProperties(
+            DevUniformCudaHipRt<TApi> const& device,
+            alpaka::DeviceProperties& devProperties)
+        {
+            // There is cuda/hip-DeviceGetAttribute as faster alternative to
+            // cuda/hip-GetDeviceProperties to get a single device property but it has no option to get
+            // the name
+            auto devHandle = device.getNativeHandle();
+            typename TApi::DeviceProp_t devProp;
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, devHandle));
+            devProperties.name = std::string(devProp.name);
+
+            std::size_t freeInternal(0u);
+            std::size_t totalInternal(0u);
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
+            devProperties.totalGlobalMem = totalInternal;
+
+            int warpSize = 0;
+            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, devHandle));
+            devProperties.warpSizes = std::vector<std::size_t>{static_cast<std::size_t>(warpSize)};
+            devProperties.preferredWarpSize = static_cast<std::size_t>(warpSize);
         }
 
+        friend struct trait::GetName<DevUniformCudaHipRt<TApi>>;
+        friend struct trait::GetMemBytes<DevUniformCudaHipRt<TApi>>;
+        friend struct trait::GetFreeMemBytes<DevUniformCudaHipRt<TApi>>;
+        friend struct trait::GetWarpSizes<DevUniformCudaHipRt<TApi>>;
+        friend struct trait::GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>;
+
     private:
         DevUniformCudaHipRt(int iDevice)
             : m_iDevice(iDevice)
-            , m_QueueRegistry(std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>())
+            , m_DevGenericImpl(std::make_shared<alpaka::detail::DevGenericImpl<IDeviceQueue>>())
         {
         }
 
         int m_iDevice;
 
-        std::shared_ptr<alpaka::detail::QueueRegistry<IDeviceQueue>> m_QueueRegistry;
+        std::shared_ptr<alpaka::detail::DevGenericImpl<IDeviceQueue>> m_DevGenericImpl;
     };
 
     namespace trait
     {
+
         //! The CUDA/HIP RT device name get trait specialization.
         template<typename TApi>
         struct GetName<DevUniformCudaHipRt<TApi>>
         {
             ALPAKA_FN_HOST static auto getName(DevUniformCudaHipRt<TApi> const& dev) -> std::string
             {
-                // There is cuda/hip-DeviceGetAttribute as faster alternative to cuda/hip-GetDeviceProperties to get a
-                // single device property but it has no option to get the name
-                typename TApi::DeviceProp_t devProp;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
-
-                return std::string(devProp.name);
+                return dev.m_DevGenericImpl->deviceProperties(dev)->name;
             }
         };
 
@@ -129,15 +161,7 @@ namespace alpaka
         {
             ALPAKA_FN_HOST static auto getMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
             {
-                // Set the current device to wait for.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-
-                std::size_t freeInternal(0u);
-                std::size_t totalInternal(0u);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
-
-                return totalInternal;
+                return dev.m_DevGenericImpl->deviceProperties(dev)->totalGlobalMem;
             }
         };
 
@@ -147,12 +171,9 @@ namespace alpaka
         {
             ALPAKA_FN_HOST static auto getFreeMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
             {
-                // Set the current device to wait for.
                 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-
                 std::size_t freeInternal(0u);
                 std::size_t totalInternal(0u);
-
                 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
 
                 return freeInternal;
@@ -165,7 +186,7 @@ namespace alpaka
         {
             ALPAKA_FN_HOST static auto getWarpSizes(DevUniformCudaHipRt<TApi> const& dev) -> std::vector<std::size_t>
             {
-                return {GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>::getPreferredWarpSize(dev)};
+                return dev.m_DevGenericImpl->deviceProperties(dev)->warpSizes;
             }
         };
 
@@ -175,11 +196,7 @@ namespace alpaka
         {
             ALPAKA_FN_HOST static auto getPreferredWarpSize(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
             {
-                int warpSize = 0;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, dev.getNativeHandle()));
-                return static_cast<std::size_t>(warpSize);
+                return dev.m_DevGenericImpl->deviceProperties(dev)->preferredWarpSize;
             }
         };
 

diff --git a/include/alpaka/dev/common/QueueRegistry.hpp → include/alpaka/dev/common/DevGenericImpl.hpp b/include/alpaka/dev/common/QueueRegistry.hpp → include/alpaka/dev/common/DevGenericImpl.hpp
@@ -5,19 +5,22 @@
 #pragma once
 
 #include "alpaka/core/Common.hpp"
+#include "alpaka/dev/common/DeviceProperties.hpp"
 
 #include <deque>
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <optional>
 
 namespace alpaka::detail
 {
+
     //! The CPU/GPU device queue registry implementation.
     //!
     //! @tparam TQueue queue implementation
     template<typename TQueue>
-    struct QueueRegistry
+    struct DevGenericImpl
     {
         ALPAKA_FN_HOST auto getAllExistingQueues() const -> std::vector<std::shared_ptr<TQueue>>
         {
@@ -52,8 +55,24 @@ namespace alpaka::detail
             m_queues.push_back(spQueue);
         }
 
+        template<typename TDev>
+        auto deviceProperties(TDev const& device) -> std::optional<alpaka::DeviceProperties>&
+        {
+            std::call_once(
+                m_onceFlag,
+                [&]() noexcept
+                {
+                    m_deviceProperties = std::make_optional<alpaka::DeviceProperties>();
+                    TDev::setDeviceProperties(device, *m_deviceProperties);
+                });
+
+            return m_deviceProperties;
+        }
+
     private:
         std::mutex mutable m_Mutex;
+        std::once_flag m_onceFlag;
+        std::optional<alpaka::DeviceProperties> m_deviceProperties;
         std::deque<std::weak_ptr<TQueue>> mutable m_queues;
     };
 } // namespace alpaka::detail