owensgroup
diff --git a/‎include/rxmesh/arg_ops.h‎
Lines changed: 47 additions & 0 deletions b/‎include/rxmesh/arg_ops.h‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎include/rxmesh/cavity_manager_impl.cuh‎
Lines changed: 1 addition & 1 deletion b/‎include/rxmesh/cavity_manager_impl.cuh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/rxmesh/kernels/attribute.cuh‎
Lines changed: 43 additions & 73 deletions b/‎include/rxmesh/kernels/attribute.cuh‎
Lines changed: 43 additions & 73 deletions
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <limits>
+
+#include <cub/util_type.cuh>
+
+namespace rxmesh {
+
+template <typename HandleT, typename T>
+using KeyValuePair = cub::KeyValuePair<HandleT, T>;
+
+namespace detail {
+
+/**
+ * @brief Binary reduction functor that keeps the key-value pair holding the
+ * larger value, i.e., an arg-max over KeyValuePair<HandleT, T>.
+ *
+ * The comparison is strict, so when both values compare equal (or the
+ * comparison is false for any other reason, e.g., a NaN operand for
+ * floating-point T) the first operand is returned.
+ *
+ * @tparam HandleT type stored as the pair's key
+ * @tparam T type stored as the pair's value; must be supported by
+ * std::numeric_limits and operator>
+ */
+template <typename HandleT, typename T>
+struct ArgMaxOp
+{
+    /**
+     * @brief Value a thread should start from before folding any real
+     * element: the lowest finite value of T, so no element compares smaller.
+     * Explicitly marked __host__ __device__ because it is evaluated inside
+     * kernels as well as being usable on the host.
+     */
+    __host__ __device__ constexpr T default_val() const
+    {
+        return std::numeric_limits<T>::lowest();
+    }
+
+    /**
+     * @brief Return b if its value is strictly greater than a's value,
+     * otherwise return a.
+     */
+    __device__ __forceinline__ KeyValuePair<HandleT, T> operator()(
+        const KeyValuePair<HandleT, T>& a,
+        const KeyValuePair<HandleT, T>& b) const
+    {
+        return (b.value > a.value) ? b : a;
+    }
+};
+
+
+/**
+ * @brief Binary reduction functor that keeps the key-value pair holding the
+ * smaller value, i.e., an arg-min over KeyValuePair<HandleT, T>.
+ *
+ * The comparison is strict, so when both values compare equal (or the
+ * comparison is false for any other reason, e.g., a NaN operand for
+ * floating-point T) the first operand is returned.
+ *
+ * @tparam HandleT type stored as the pair's key
+ * @tparam T type stored as the pair's value; must be supported by
+ * std::numeric_limits and operator<
+ */
+template <typename HandleT, typename T>
+struct ArgMinOp
+{
+    /**
+     * @brief Value a thread should start from before folding any real
+     * element: the largest finite value of T, so no element compares larger.
+     * Explicitly marked __host__ __device__ because it is evaluated inside
+     * kernels as well as being usable on the host.
+     */
+    __host__ __device__ constexpr T default_val() const
+    {
+        return std::numeric_limits<T>::max();
+    }
+
+    /**
+     * @brief Return b if its value is strictly less than a's value,
+     * otherwise return a.
+     */
+    __device__ __forceinline__ KeyValuePair<HandleT, T> operator()(
+        const KeyValuePair<HandleT, T>& a,
+        const KeyValuePair<HandleT, T>& b) const
+    {
+        return (b.value < a.value) ? b : a;
+    }
+};
+
+
+}  // namespace detail
+}  // namespace rxmesh
@@ -3826,7 +3826,7 @@ CavityManager<blockThreads, cop>::populate_correspondence(
             const LPPair lp =
                 m_patch_info.get_lp<HandleT>().find(b, s_table, s_stash);
 
-            assert(lp.local_id == b);
+            assert(lp.local_id() == b);
 
             // inner
             for (int c = 0; c < q_num_elements; ++c) {
 
@@ -2,6 +2,7 @@
 #include <cub/block/block_reduce.cuh>
 #include "rxmesh/util/macros.h"
 
+#include "rxmesh/arg_ops.h"
 
 namespace rxmesh {
 
@@ -10,7 +11,7 @@ class Attribute;
 
 namespace detail {
 
-template <class T, uint32_t blockSize>
+template <uint32_t blockSize, class T>
 __device__ __forceinline__ void cub_block_sum(const T thread_val,
                                               T*      d_block_output)
 {
@@ -22,6 +23,23 @@ __device__ __forceinline__ void cub_block_sum(const T thread_val,
     }
 }
 
+/**
+ * @brief Reduce one value per thread across the block with a user-supplied
+ * binary operator and store the block's result in
+ * d_block_output[blockIdx.x].
+ *
+ * Must be reached by every thread of the block since it performs a
+ * cub::BlockReduce. The temporary storage is a function-local __shared__
+ * variable.
+ * NOTE(review): cub::BlockReduce<T, blockSize> presumably requires the kernel
+ * to be launched with blockSize threads along x -- confirm at call sites.
+ *
+ * @tparam blockSize number of threads used to instantiate cub::BlockReduce
+ * @tparam T type of the value being reduced
+ * @tparam ReductionOp binary functor type combining two values of type T
+ * @param thread_val the calling thread's contribution
+ * @param d_block_output output array indexed by blockIdx.x
+ * @param reduction_op the binary reduction functor
+ */
+template <uint32_t blockSize, class T, typename ReductionOp>
+__device__ __forceinline__ void cub_block_reduce(const T     thread_val,
+                                                 T*          d_block_output,
+                                                 ReductionOp reduction_op)
+{
+    using BlockReduceT = cub::BlockReduce<T, blockSize>;
+
+    __shared__ typename BlockReduceT::TempStorage s_reduce_storage;
+
+    BlockReduceT reducer(s_reduce_storage);
+
+    const T aggregate = reducer.Reduce(thread_val, reduction_op);
+
+    // a single thread publishes the per-block result
+    if (threadIdx.x == 0) {
+        d_block_output[blockIdx.x] = aggregate;
+    }
+}
+
 template <class T, uint32_t blockSize, typename HandleT>
 __launch_bounds__(blockSize) __global__
     void norm2_kernel(const Attribute<T, HandleT> X,
@@ -52,7 +70,7 @@ __launch_bounds__(blockSize) __global__
             }
         }
 
-        cub_block_sum<T, blockSize>(thread_val, d_block_output);
+        cub_block_sum<blockSize>(thread_val, d_block_output);
     }
 }
 
@@ -90,95 +108,53 @@ __launch_bounds__(blockSize) __global__
             }
         }
 
-        cub_block_sum<T, blockSize>(thread_val, d_block_output);
+        cub_block_sum<blockSize>(thread_val, d_block_output);
     }
 }
 
-template <typename HandleT, typename T>
-struct CustomMaxPair
-{
-    __host__ __device__ CustomMaxPair()
-    {
-        default_val = (std::numeric_limits<T>::lowest());
-    }
-
-    __device__ __forceinline__ cub::KeyValuePair<HandleT, T> operator()(
-        const cub::KeyValuePair<HandleT, T>& a,
-        const cub::KeyValuePair<HandleT, T>& b) const
-    {
-        return (b.value > a.value) ? b : a;
-    }
-    T default_val;
-};
-
-template <typename HandleT, typename T>
-struct CustomMinPair
-{
-    __host__ __device__ CustomMinPair()
-    {
-        default_val = (std::numeric_limits<T>::max());
-    }
-    __device__ __forceinline__ cub::KeyValuePair<HandleT, T> operator()(
-        const cub::KeyValuePair<HandleT, T>& a,
-        const cub::KeyValuePair<HandleT, T>& b) const
-    {
-        return (b.value < a.value) ? b : a;
-    }
-    T default_val;
-};
-
+/**
+ * @brief Per-patch arg-min/arg-max over an attribute. Block blockIdx.x
+ * processes patch blockIdx.x; its threads walk the patch's elements with a
+ * stride of blockSize, skipping elements that are not owned by the patch or
+ * that are deleted. Each thread folds the (handle, value) pairs it visits with
+ * reduction_op, then the block reduces the per-thread results and writes one
+ * KeyValuePair to d_block_output[blockIdx.x].
+ *
+ * @param X the attribute to search
+ * @param attribute_id the attribute (component) index to inspect; when equal
+ * to INVALID32 all num_attributes components of every element are inspected
+ * @param reduction_op functor providing default_val() (the starting value)
+ * and operator() combining two KeyValuePair<HandleT, T>
+ * @param num_patches blocks with blockIdx.x >= num_patches do nothing
+ * @param num_attributes number of components looped over when attribute_id
+ * is INVALID32
+ * @param d_block_output output array with one entry per block
+ */
 template <class T, uint32_t blockSize, typename HandleT, typename Operation>
 __launch_bounds__(blockSize) __global__
     void arg_minmax_kernel(const Attribute<T, HandleT> X,
-                    uint32_t                    attribute_id,
-                    Operation                   op, //can be either max or min operation
-                    const uint32_t              num_patches,
-                    const uint32_t              num_attributes,
-                    cub::KeyValuePair<HandleT, T>*  d_block_output)
+                           uint32_t                    attribute_id,
+                           Operation                   reduction_op,
+                           const uint32_t              num_patches,
+                           const uint32_t              num_attributes,
+                           KeyValuePair<HandleT, T>*   d_block_output)
 {
     using LocalT = typename HandleT::LocalT;
 
-    assert(X.get_num_attributes() == 1); //we can only take arg max for a scalar attribute
-
     uint32_t p_id = blockIdx.x;
     if (p_id < num_patches) {
-        const uint16_t element_per_patch = X.size(p_id);
-        cub::KeyValuePair<HandleT, T> thread_val;
-        thread_val.value = op.default_val;
+        const uint16_t           element_per_patch = X.size(p_id);
+        KeyValuePair<HandleT, T> thread_val;
+        thread_val.value = reduction_op.default_val();
         thread_val.key   = HandleT(p_id, threadIdx.x);
         for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) {
 
             if (X.get_patch_info(p_id).is_owned(LocalT(i)) &&
                 !X.get_patch_info(p_id).is_deleted(LocalT(i))) {
 
-                if (attribute_id != INVALID32 ) 
-                {
-                    HandleT handle(p_id, i);
-                    cub::KeyValuePair<HandleT, T> current_pair(handle, X(p_id, i, attribute_id));
-                    thread_val = op(thread_val, current_pair);
-                }
-                else {
-                    for (uint32_t j = 0; j < num_attributes; ++j) 
-                    {
-                        HandleT handle(p_id, i);
-                        cub::KeyValuePair<HandleT, T> current_pair(handle, X(p_id, i, j));
-                        thread_val = op(thread_val, current_pair);
+                if (attribute_id != INVALID32) {
+                    HandleT                  handle(p_id, i);
+                    KeyValuePair<HandleT, T> current_pair(
+                        handle, X(p_id, i, attribute_id));
+                    thread_val = reduction_op(thread_val, current_pair);
+                } else {
+                    for (uint32_t j = 0; j < num_attributes; ++j) {
+                        HandleT                  handle(p_id, i);
+                        KeyValuePair<HandleT, T> current_pair(handle,
+                                                              X(p_id, i, j));
+                        thread_val = reduction_op(thread_val, current_pair);
                     }
                 }
             }
         }
-        typedef cub::BlockReduce<cub::KeyValuePair<HandleT, T>, blockSize> BlockReduce;
-        __shared__ typename BlockReduce::TempStorage temp_storage;
-        cub::KeyValuePair<HandleT, T> block_aggregate = BlockReduce(temp_storage).Reduce(thread_val, op);
-        if (threadIdx.x == 0) 
-        {
-            d_block_output[blockIdx.x] = block_aggregate;
-        }
+
+        // the enclosing condition depends only on blockIdx.x, so all threads
+        // of the block reach this block-wide reduction together
+        cub_block_reduce<blockSize>(thread_val, d_block_output, reduction_op);
     }
 }
 
 
-
 template <class T, uint32_t blockSize, typename ReductionOp, typename HandleT>
 __launch_bounds__(blockSize) __global__
     void generic_reduce(const Attribute<T, HandleT> X,
@@ -209,14 +185,8 @@ __launch_bounds__(blockSize) __global__
                 }
             }
         }
-        typedef cub::BlockReduce<T, blockSize>       BlockReduce;
-        __shared__ typename BlockReduce::TempStorage temp_storage;
 
-        T block_aggregate =
-            BlockReduce(temp_storage).Reduce(thread_val, reduction_op);
-        if (threadIdx.x == 0) {
-            d_block_output[blockIdx.x] = block_aggregate;
-        }
+        cub_block_reduce<blockSize>(thread_val, d_block_output, reduction_op);
     }
 }