@@ -161,51 +161,52 @@ namespace amrex
161161 AMREX_GPU_HOST_DEVICE
162162 ItType upper_bound (ItType first, ItType last, const ValType& val)
163163 {
164- #if AMREX_DEVICE_COMPILE
165- std::ptrdiff_t count = last-first;
166- while (count>0 ){
167- auto it = first;
168- const auto step = count/2 ;
169- it += step;
170- if (!(val < *it)){
171- first = ++it;
172- count -= step + 1 ;
164+ AMREX_IF_ON_DEVICE ((
165+ std::ptrdiff_t count = last-first;
166+ while (count>0 ){
167+ auto it = first;
168+ const auto step = count/2 ;
169+ it += step;
170+ if (!(val < *it)){
171+ first = ++it;
172+ count -= step + 1 ;
173+ }
174+ else {
175+ count = step;
176+ }
173177 }
174- else {
175- count = step;
176- }
177- }
178-
179- return first;
180- #else
181- return std::upper_bound (first, last, val);
182- #endif
178+ return first;
179+ ))
180+ AMREX_IF_ON_HOST ((
181+ return std::upper_bound (first, last, val);
182+ ))
183183 }
184184
185185 template <typename ItType, typename ValType>
186186 AMREX_GPU_HOST_DEVICE
187187 ItType lower_bound (ItType first, ItType last, const ValType& val)
188188 {
189- #ifdef AMREX_DEVICE_COMPILE
190- std::ptrdiff_t count = last-first;
191- while (count>0 )
192- {
193- auto it = first;
194- const auto step = count/2 ;
195- it += step;
196- if (*it < val){
197- first = ++it;
198- count -= step + 1 ;
199- }
200- else {
201- count = step;
189+ AMREX_IF_ON_DEVICE ((
190+ std::ptrdiff_t count = last-first;
191+ while (count>0 )
192+ {
193+ auto it = first;
194+ const auto step = count/2 ;
195+ it += step;
196+ if (*it < val){
197+ first = ++it;
198+ count -= step + 1 ;
199+ }
200+ else {
201+ count = step;
202+ }
202203 }
203- }
204204
205- return first;
206- #else
207- return std::lower_bound (first, last, val);
208- #endif
205+ return first;
206+ ))
207+ AMREX_IF_ON_HOST ((
208+ return std::lower_bound (first, last, val);
209+ ))
209210 }
210211
211212namespace detail {
@@ -239,83 +240,100 @@ int builtin_clz_wrapper (clzll_tag, T x) noexcept
239240 return static_cast <int >(__builtin_clzll (x) - (sizeof (unsigned long long ) * CHAR_BIT - sizeof (T) * CHAR_BIT));
240241}
241242
242- #ifdef AMREX_USE_CUDA
243-
244- // likewise with CUDA, there are __clz functions that take (signed) int and long long int
245- template <typename T, typename = typename std::enable_if<sizeof (T) <= sizeof (int )>::type>
246- AMREX_GPU_DEVICE AMREX_FORCE_INLINE
247- int clz_wrapper (clz_tag, T x) noexcept
248- {
249- return __clz ((int ) x) - (sizeof (int ) * CHAR_BIT - sizeof (T) * CHAR_BIT);
250- }
251-
252- template <typename T, typename = typename std::enable_if<sizeof (T) <= sizeof (long long int )>::type>
253- AMREX_GPU_DEVICE AMREX_FORCE_INLINE
254- int clz_wrapper (clzll_tag, T x) noexcept
255- {
256- return __clzll ((long long int ) x) - (sizeof (long long int ) * CHAR_BIT - sizeof (T) * CHAR_BIT);
257243}
258- #endif
259244
260- }
245+ template <class T , typename std::enable_if_t <std::is_same_v<std::decay_t <T>,std::uint8_t > ||
246+ std::is_same_v<std::decay_t <T>,std::uint16_t > ||
247+ std::is_same_v<std::decay_t <T>,std::uint32_t > ||
248+ std::is_same_v<std::decay_t <T>,std::uint64_t >, int > = 0 >
249+ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
250+ int clz (T x) noexcept ;
261251
262252AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
263- int clz (std::uint8_t x) noexcept
253+ int clz_generic (std::uint8_t x) noexcept
264254{
265- #if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
266- return detail::clz_wrapper (detail::clz_tag{}, x);
267- #elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
268- return detail::builtin_clz_wrapper (detail::clz_tag{}, x);
269- #else
270255 static constexpr int clz_lookup[16 ] = { 4 , 3 , 2 , 2 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
271256 auto upper = x >> 4 ;
272257 auto lower = x & 0xF ;
273258 return upper ? clz_lookup[upper] : 4 + clz_lookup[lower];
274- #endif
275259}
276260
277261AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
278- int clz (std::uint16_t x) noexcept
262+ int clz_generic (std::uint16_t x) noexcept
279263{
280- #if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
281- return detail::clz_wrapper (detail::clz_tag{}, x);
282- #elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
283- return detail::builtin_clz_wrapper (detail::clz_tag{}, x);
284- #else
285264 auto upper = std::uint8_t (x >> 8 );
286265 auto lower = std::uint8_t (x & 0xFF );
287266 return upper ? clz (upper) : 8 + clz (lower);
288- #endif
289267}
290268
291269AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
292- int clz (std::uint32_t x) noexcept
270+ int clz_generic (std::uint32_t x) noexcept
293271{
294- #if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
295- return detail::clz_wrapper (detail::clz_tag{}, x);
296- #elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
297- return detail::builtin_clz_wrapper (detail::clz_tag{}, x);
298- #else
299272 auto upper = std::uint16_t (x >> 16 );
300273 auto lower = std::uint16_t (x & 0xFFFF );
301274 return upper ? clz (upper) : 16 + clz (lower);
302- #endif
303275}
304276
305277AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
306- int clz (std::uint64_t x) noexcept
278+ int clz_generic (std::uint64_t x) noexcept
307279{
308- #if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
309- return detail::clz_wrapper (detail::clz_tag{}, x);
310- #elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
311- return detail::builtin_clz_wrapper (detail::clz_tag{}, x);
312- #else
313280 auto upper = std::uint32_t (x >> 32 );
314281 auto lower = std::uint32_t (x & 0xFFFFFFFF );
315282 return upper ? clz (upper) : 32 + clz (lower);
283+ }
284+
285+ #if defined AMREX_USE_CUDA
286+
287+ namespace detail {
288+ // likewise with CUDA, there are __clz functions that take (signed) int and long long int
289+ template <typename T, typename = typename std::enable_if<sizeof (T) <= sizeof (int )>::type>
290+ AMREX_GPU_DEVICE AMREX_FORCE_INLINE
291+ int clz_wrapper (clz_tag, T x) noexcept
292+ {
293+ return __clz ((int ) x) - (sizeof (int ) * CHAR_BIT - sizeof (T) * CHAR_BIT);
294+ }
295+
296+ template <typename T, typename = typename std::enable_if<sizeof (T) <= sizeof (long long int )>::type>
297+ AMREX_GPU_DEVICE AMREX_FORCE_INLINE
298+ int clz_wrapper (clzll_tag, T x) noexcept
299+ {
300+ return __clzll ((long long int ) x) - (sizeof (long long int ) * CHAR_BIT - sizeof (T) * CHAR_BIT);
301+ }
302+ }
303+
304+ template <class T , typename std::enable_if_t <std::is_same_v<std::decay_t <T>,std::uint8_t > ||
305+ std::is_same_v<std::decay_t <T>,std::uint16_t > ||
306+ std::is_same_v<std::decay_t <T>,std::uint32_t > ||
307+ std::is_same_v<std::decay_t <T>,std::uint64_t >, int > >
308+ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
309+ int clz (T x) noexcept
310+ {
311+ AMREX_IF_ON_DEVICE ((return detail::clz_wrapper (detail::clz_tag{}, x);))
312+ #if AMREX_HAS_BUILTIN_CLZ
313+ AMREX_IF_ON_HOST ((return detail::builtin_clz_wrapper (detail::clz_tag{}, x);))
314+ #else
315+ AMREX_IF_ON_HOST ((return clz_generic (x);))
316316#endif
317317}
318318
319+ #else // !defined AMREX_USE_CUDA
320+
321+ template <class T , typename std::enable_if_t <std::is_same_v<std::decay_t <T>,std::uint8_t > ||
322+ std::is_same_v<std::decay_t <T>,std::uint16_t > ||
323+ std::is_same_v<std::decay_t <T>,std::uint32_t > ||
324+ std::is_same_v<std::decay_t <T>,std::uint64_t >, int > >
325+ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
326+ int clz (T x) noexcept
327+ {
328+ #if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
329+ return detail::builtin_clz_wrapper (detail::clz_tag{}, x);
330+ #else
331+ return clz_generic (x);
332+ #endif
333+ }
334+
335+ #endif // defined AMREX_USE_CUDA
336+
319337}
320338
321339#endif
0 commit comments