Skip to content

Commit 5bc1579

Browse files
authored
Implement cache of device properties (#2492)
* Cache device properties after first API calls * Make caching of device properties thread-safe * Move device properties inside QueueRegistry * Use static variables and call_once instead of locks * Set all properties on first once_call * Define functions to be used inside call_once lambda * Encaplusate set of dev props inside QueueRegistry/DevSyclImpl * Template `setDeviceProperties` and specialize for cpu * Pass device to `setDeviceProperties` * Rename `QueueRegistry` as `DevGenericImpl` * Make `setDeviceProperties` a static method of the device
1 parent e906b1c commit 5bc1579

File tree

5 files changed

+141
-51
lines changed

5 files changed

+141
-51
lines changed

include/alpaka/dev/DevCpu.hpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#pragma once
77

88
#include "alpaka/dev/Traits.hpp"
9-
#include "alpaka/dev/common/QueueRegistry.hpp"
9+
#include "alpaka/dev/common/DevGenericImpl.hpp"
1010
#include "alpaka/dev/cpu/SysInfo.hpp"
1111
#include "alpaka/mem/buf/Traits.hpp"
1212
#include "alpaka/platform/Traits.hpp"
@@ -46,7 +46,7 @@ namespace alpaka
4646
namespace cpu::detail
4747
{
4848
//! The CPU device implementation.
49-
using DevCpuImpl = alpaka::detail::QueueRegistry<cpu::ICpuQueue>;
49+
using DevCpuImpl = alpaka::detail::DevGenericImpl<cpu::ICpuQueue>;
5050
} // namespace cpu::detail
5151

5252
//! The CPU device handle.
@@ -89,29 +89,42 @@ namespace alpaka
8989
return 0;
9090
}
9191

92+
static void setDeviceProperties(alpaka::DevCpu const&, alpaka::DeviceProperties& devProperties)
93+
{
94+
devProperties.name = cpu::detail::getCpuName();
95+
devProperties.totalGlobalMem = cpu::detail::getTotalGlobalMemSizeBytes();
96+
}
97+
98+
friend struct trait::GetName<DevCpu>;
99+
friend struct trait::GetMemBytes<DevCpu>;
100+
friend struct trait::GetFreeMemBytes<DevCpu>;
101+
friend struct trait::GetWarpSizes<DevCpu>;
102+
friend struct trait::GetPreferredWarpSize<DevCpu>;
103+
92104
private:
93105
std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
94106
};
95107

96108
namespace trait
97109
{
110+
98111
//! The CPU device name get trait specialization.
99112
template<>
100113
struct GetName<DevCpu>
101114
{
102-
ALPAKA_FN_HOST static auto getName(DevCpu const& /* dev */) -> std::string
115+
ALPAKA_FN_HOST static auto getName(DevCpu const& dev) -> std::string
103116
{
104-
return cpu::detail::getCpuName();
117+
return dev.m_spDevCpuImpl->deviceProperties(dev)->name;
105118
}
106119
};
107120

108121
//! The CPU device available memory get trait specialization.
109122
template<>
110123
struct GetMemBytes<DevCpu>
111124
{
112-
ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& /* dev */) -> std::size_t
125+
ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& dev) -> std::size_t
113126
{
114-
return cpu::detail::getTotalGlobalMemSizeBytes();
127+
return dev.m_spDevCpuImpl->deviceProperties(dev)->totalGlobalMem;
115128
}
116129
};
117130

include/alpaka/dev/DevGenericSycl.hpp

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "alpaka/core/Common.hpp"
1010
#include "alpaka/core/Sycl.hpp"
1111
#include "alpaka/dev/Traits.hpp"
12+
#include "alpaka/dev/common/DeviceProperties.hpp"
1213
#include "alpaka/mem/buf/Traits.hpp"
1314
#include "alpaka/platform/Traits.hpp"
1415
#include "alpaka/queue/Properties.hpp"
@@ -112,11 +113,41 @@ namespace alpaka
112113
return m_context;
113114
}
114115

116+
auto deviceProperties() -> std::optional<alpaka::DeviceProperties>&
117+
{
118+
std::call_once(
119+
m_onceFlag,
120+
[&]()
121+
{
122+
m_deviceProperties = std::make_optional<alpaka::DeviceProperties>();
123+
auto const& device = this->get_device();
124+
m_deviceProperties->name = device.template get_info<sycl::info::device::name>();
125+
m_deviceProperties->totalGlobalMem
126+
= device.template get_info<sycl::info::device::global_mem_size>();
127+
128+
std::vector<std::size_t> warp_sizes
129+
= device.template get_info<sycl::info::device::sub_group_sizes>();
130+
// The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently
131+
// does not
132+
auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
133+
if(find64 != warp_sizes.end())
134+
warp_sizes.erase(find64);
135+
// Sort the warp sizes in decreasing order
136+
std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
137+
m_deviceProperties->warpSizes = std::move(warp_sizes);
138+
m_deviceProperties->preferredWarpSize = m_deviceProperties->warpSizes.front();
139+
});
140+
141+
return m_deviceProperties;
142+
}
143+
115144
private:
116145
sycl::device m_device;
117146
sycl::context m_context;
118147
std::vector<std::weak_ptr<QueueGenericSyclImpl>> m_queues;
148+
std::optional<alpaka::DeviceProperties> m_deviceProperties;
119149
std::shared_mutex mutable m_mutex;
150+
std::once_flag m_onceFlag;
120151
};
121152
} // namespace detail
122153

@@ -154,14 +185,14 @@ namespace alpaka
154185

155186
namespace trait
156187
{
188+
157189
//! The SYCL device name get trait specialization.
158190
template<concepts::Tag TTag>
159191
struct GetName<DevGenericSycl<TTag>>
160192
{
161193
static auto getName(DevGenericSycl<TTag> const& dev) -> std::string
162194
{
163-
auto const device = dev.getNativeHandle().first;
164-
return device.template get_info<sycl::info::device::name>();
195+
return dev.m_impl->deviceProperties()->name;
165196
}
166197
};
167198

@@ -171,8 +202,7 @@ namespace alpaka
171202
{
172203
static auto getMemBytes(DevGenericSycl<TTag> const& dev) -> std::size_t
173204
{
174-
auto const device = dev.getNativeHandle().first;
175-
return device.template get_info<sycl::info::device::global_mem_size>();
205+
return dev.m_impl->deviceProperties()->totalGlobalMem;
176206
}
177207
};
178208

@@ -195,15 +225,7 @@ namespace alpaka
195225
{
196226
static auto getWarpSizes(DevGenericSycl<TTag> const& dev) -> std::vector<std::size_t>
197227
{
198-
auto const device = dev.getNativeHandle().first;
199-
std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
200-
// The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
201-
auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
202-
if(find64 != warp_sizes.end())
203-
warp_sizes.erase(find64);
204-
// Sort the warp sizes in decreasing order
205-
std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
206-
return warp_sizes;
228+
return dev.m_impl->deviceProperties()->warpSizes;
207229
}
208230
};
209231

@@ -213,7 +235,7 @@ namespace alpaka
213235
{
214236
static auto getPreferredWarpSize(DevGenericSycl<TTag> const& dev) -> std::size_t
215237
{
216-
return GetWarpSizes<DevGenericSycl<TTag>>::getWarpSizes(dev).front();
238+
return dev.m_impl->deviceProperties()->preferredWarpSize;
217239
}
218240
};
219241

include/alpaka/dev/DevUniformCudaHipRt.hpp

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
#include "alpaka/core/Hip.hpp"
1111
#include "alpaka/core/Interface.hpp"
1212
#include "alpaka/dev/Traits.hpp"
13-
#include "alpaka/dev/common/QueueRegistry.hpp"
13+
#include "alpaka/dev/common/DevGenericImpl.hpp"
14+
#include "alpaka/dev/common/DeviceProperties.hpp"
1415
#include "alpaka/mem/buf/Traits.hpp"
1516
#include "alpaka/platform/Traits.hpp"
1617
#include "alpaka/queue/Properties.hpp"
@@ -20,13 +21,18 @@
2021
#include "alpaka/wait/Traits.hpp"
2122

2223
#include <cstddef>
24+
#include <mutex>
2325
#include <string>
2426
#include <vector>
2527

2628
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
2729

2830
namespace alpaka
2931
{
32+
33+
template<typename TApi>
34+
class DevUniformCudaHipRt;
35+
3036
namespace trait
3137
{
3238
template<typename TPlatform, typename TSfinae>
@@ -62,7 +68,7 @@ namespace alpaka
6268
using IDeviceQueue = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;
6369

6470
protected:
65-
DevUniformCudaHipRt() : m_QueueRegistry{std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>()}
71+
DevUniformCudaHipRt() : m_DevGenericImpl{std::make_shared<alpaka::detail::DevGenericImpl<IDeviceQueue>>()}
6672
{
6773
}
6874

@@ -84,42 +90,68 @@ namespace alpaka
8490

8591
[[nodiscard]] ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<IDeviceQueue>>
8692
{
87-
return m_QueueRegistry->getAllExistingQueues();
93+
return m_DevGenericImpl->getAllExistingQueues();
8894
}
8995

9096
//! Registers the given queue on this device.
9197
//! NOTE: Every queue has to be registered for correct functionality of device wait operations!
9298
ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IDeviceQueue> spQueue) const -> void
9399
{
94-
m_QueueRegistry->registerQueue(spQueue);
100+
m_DevGenericImpl->registerQueue(spQueue);
101+
}
102+
103+
static void setDeviceProperties(
104+
DevUniformCudaHipRt<TApi> const& device,
105+
alpaka::DeviceProperties& devProperties)
106+
{
107+
// There is cuda/hip-DeviceGetAttribute as faster alternative to
108+
// cuda/hip-GetDeviceProperties to get a single device property but it has no option to get
109+
// the name
110+
auto devHandle = device.getNativeHandle();
111+
typename TApi::DeviceProp_t devProp;
112+
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, devHandle));
113+
devProperties.name = std::string(devProp.name);
114+
115+
std::size_t freeInternal(0u);
116+
std::size_t totalInternal(0u);
117+
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
118+
devProperties.totalGlobalMem = totalInternal;
119+
120+
int warpSize = 0;
121+
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
122+
TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, devHandle));
123+
devProperties.warpSizes = std::vector<std::size_t>{static_cast<std::size_t>(warpSize)};
124+
devProperties.preferredWarpSize = static_cast<std::size_t>(warpSize);
95125
}
96126

127+
friend struct trait::GetName<DevUniformCudaHipRt<TApi>>;
128+
friend struct trait::GetMemBytes<DevUniformCudaHipRt<TApi>>;
129+
friend struct trait::GetFreeMemBytes<DevUniformCudaHipRt<TApi>>;
130+
friend struct trait::GetWarpSizes<DevUniformCudaHipRt<TApi>>;
131+
friend struct trait::GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>;
132+
97133
private:
98134
DevUniformCudaHipRt(int iDevice)
99135
: m_iDevice(iDevice)
100-
, m_QueueRegistry(std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>())
136+
, m_DevGenericImpl(std::make_shared<alpaka::detail::DevGenericImpl<IDeviceQueue>>())
101137
{
102138
}
103139

104140
int m_iDevice;
105141

106-
std::shared_ptr<alpaka::detail::QueueRegistry<IDeviceQueue>> m_QueueRegistry;
142+
std::shared_ptr<alpaka::detail::DevGenericImpl<IDeviceQueue>> m_DevGenericImpl;
107143
};
108144

109145
namespace trait
110146
{
147+
111148
//! The CUDA/HIP RT device name get trait specialization.
112149
template<typename TApi>
113150
struct GetName<DevUniformCudaHipRt<TApi>>
114151
{
115152
ALPAKA_FN_HOST static auto getName(DevUniformCudaHipRt<TApi> const& dev) -> std::string
116153
{
117-
// There is cuda/hip-DeviceGetAttribute as faster alternative to cuda/hip-GetDeviceProperties to get a
118-
// single device property but it has no option to get the name
119-
typename TApi::DeviceProp_t devProp;
120-
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
121-
122-
return std::string(devProp.name);
154+
return dev.m_DevGenericImpl->deviceProperties(dev)->name;
123155
}
124156
};
125157

@@ -129,15 +161,7 @@ namespace alpaka
129161
{
130162
ALPAKA_FN_HOST static auto getMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
131163
{
132-
// Set the current device to wait for.
133-
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
134-
135-
std::size_t freeInternal(0u);
136-
std::size_t totalInternal(0u);
137-
138-
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
139-
140-
return totalInternal;
164+
return dev.m_DevGenericImpl->deviceProperties(dev)->totalGlobalMem;
141165
}
142166
};
143167

@@ -147,12 +171,9 @@ namespace alpaka
147171
{
148172
ALPAKA_FN_HOST static auto getFreeMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
149173
{
150-
// Set the current device to wait for.
151174
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
152-
153175
std::size_t freeInternal(0u);
154176
std::size_t totalInternal(0u);
155-
156177
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
157178

158179
return freeInternal;
@@ -165,7 +186,7 @@ namespace alpaka
165186
{
166187
ALPAKA_FN_HOST static auto getWarpSizes(DevUniformCudaHipRt<TApi> const& dev) -> std::vector<std::size_t>
167188
{
168-
return {GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>::getPreferredWarpSize(dev)};
189+
return dev.m_DevGenericImpl->deviceProperties(dev)->warpSizes;
169190
}
170191
};
171192

@@ -175,11 +196,7 @@ namespace alpaka
175196
{
176197
ALPAKA_FN_HOST static auto getPreferredWarpSize(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
177198
{
178-
int warpSize = 0;
179-
180-
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
181-
TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, dev.getNativeHandle()));
182-
return static_cast<std::size_t>(warpSize);
199+
return dev.m_DevGenericImpl->deviceProperties(dev)->preferredWarpSize;
183200
}
184201
};
185202

include/alpaka/dev/common/QueueRegistry.hpp renamed to include/alpaka/dev/common/DevGenericImpl.hpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,22 @@
55
#pragma once
66

77
#include "alpaka/core/Common.hpp"
8+
#include "alpaka/dev/common/DeviceProperties.hpp"
89

910
#include <deque>
1011
#include <functional>
1112
#include <memory>
1213
#include <mutex>
14+
#include <optional>
1315

1416
namespace alpaka::detail
1517
{
18+
1619
//! The CPU/GPU device queue registry implementation.
1720
//!
1821
//! @tparam TQueue queue implementation
1922
template<typename TQueue>
20-
struct QueueRegistry
23+
struct DevGenericImpl
2124
{
2225
ALPAKA_FN_HOST auto getAllExistingQueues() const -> std::vector<std::shared_ptr<TQueue>>
2326
{
@@ -52,8 +55,24 @@ namespace alpaka::detail
5255
m_queues.push_back(spQueue);
5356
}
5457

58+
template<typename TDev>
59+
auto deviceProperties(TDev const& device) -> std::optional<alpaka::DeviceProperties>&
60+
{
61+
std::call_once(
62+
m_onceFlag,
63+
[&]() noexcept
64+
{
65+
m_deviceProperties = std::make_optional<alpaka::DeviceProperties>();
66+
TDev::setDeviceProperties(device, *m_deviceProperties);
67+
});
68+
69+
return m_deviceProperties;
70+
}
71+
5572
private:
5673
std::mutex mutable m_Mutex;
74+
std::once_flag m_onceFlag;
75+
std::optional<alpaka::DeviceProperties> m_deviceProperties;
5776
std::deque<std::weak_ptr<TQueue>> mutable m_queues;
5877
};
5978
} // namespace alpaka::detail

0 commit comments

Comments
 (0)