#include <c10/core/Scalar.h>
#include <c10/util/irange.h>

+ #include <sstream>
#include <type_traits>

- namespace at::native { inline namespace CPU_CAPABILITY {
+ namespace at { namespace native { inline namespace CPU_CAPABILITY {

using namespace vec;
@@ -33,9 +34,9 @@ inline bool is_outer_reduction(const int64_t* strides) {
         strides[3] == sizeof(typename traits::arg2_t);
}

- template <typename func_t, typename vec_func_t, bool reduce>
+ template <typename func_t, typename vec_func_t>
inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
-     func_t op [[maybe_unused]], vec_func_t vop) {
+     func_t op, vec_func_t vop, bool reduce) {
  VEC_LOOP_HEADER(func_t, data)
  const char* in1_ptr = data[1];
  Vec acc[4];
@@ -49,7 +50,7 @@ inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
  }
-   if constexpr (reduce) {
+   if (reduce) {
    scalar_t buffer[Vec::size()];
    acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
    acc[0].store(buffer);
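Note (not part of the diff): the two hunks above replace a compile-time bool reduce template parameter plus if constexpr with an ordinary run-time argument and a plain if. A minimal standalone sketch of the two spellings, using made-up names (finish_cxx17, finish_runtime) and assuming only that the branch is cheap next to the vector loop:

#include <cstdio>

// C++17 spelling: the branch is resolved when the template is instantiated,
// so each instantiation contains only one of the two bodies.
template <bool reduce>
int finish_cxx17(int partial) {
  if constexpr (reduce) {
    return partial + 1;   // stand-in for the final horizontal reduction
  } else {
    return partial;       // stand-in for leaving per-lane partial sums in place
  }
}

// Pre-C++17 spelling (the form this change adopts): one function, and the flag
// is tested at run time on every call.
inline int finish_runtime(int partial, bool reduce) {
  return reduce ? partial + 1 : partial;
}

int main() {
  std::printf("%d %d\n", finish_cxx17<true>(41), finish_runtime(41, /*reduce=*/true));
}

Both spellings compute the same result; the difference is only whether the unused branch is removed at instantiation time or left to the optimizer and branch predictor.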
@@ -80,10 +81,10 @@ inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n,
template <typename func_t, typename vec_func_t>
inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
  VEC_LOOP_HEADER(func_t, data)
-   constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
+   int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
  int64_t count = n / (4 * Vec::size());
  if (count > 0) {
-     vectorized_reduction<func_t, vec_func_t, true>(data, count, vector_stride, op, vop);
+     vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
  }
  char* ptrs[3] = { data[0], data[0], data[1] };
  int64_t strides[] = { 0, 0, sizeof(scalar_t) };
@@ -102,7 +103,7 @@ inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_
  int64_t outer_stride[2] = { 128, 128 };
#endif
  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
-     vectorized_reduction<func_t, vec_func_t, false>(data, size0, inner_stride, op, vop);
+     vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
  });

  // reduce down the remaining columns
@@ -131,13 +132,13 @@ static void set_results(const res_t result, const TensorIteratorBase &iter, cons
}

template <typename traits, std::size_t i = 0, typename... tuple_t>
- inline std::enable_if_t<i == sizeof...(tuple_t), std::size_t>
+ inline typename std::enable_if<i == sizeof...(tuple_t), std::size_t>::type
for_each_in_tuple(const std::tuple<tuple_t...>& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) {
  return i;
}

template <typename traits, std::size_t i = 0, typename... tuple_t>
- inline std::enable_if_t<i < sizeof...(tuple_t), std::size_t>
+ inline typename std::enable_if<i < sizeof...(tuple_t), std::size_t>::type
for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIteratorBase &iter, const int num_outputs) {
  if (i < (size_t)num_outputs) {
    set_result<traits>(i, std::get<i>(t), iter, num_outputs);
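Note (not part of the diff): std::enable_if_t<Cond, T> is the C++14 alias for typename std::enable_if<Cond, T>::type, so the two return-type spellings in this hunk select exactly the same overloads; the longer form simply also compiles as C++11. A tiny sketch with hypothetical function names:

#include <type_traits>

// C++14 alias form.
template <typename T>
std::enable_if_t<std::is_integral<T>::value, T> twice_alias(T x) { return x + x; }

// Expanded pre-C++14 form; identical meaning.
template <typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type twice_expanded(T x) { return x + x; }

static_assert(std::is_same<std::enable_if_t<true, int>,
                           typename std::enable_if<true, int>::type>::value,
              "the alias names the same type");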
@@ -310,4 +311,4 @@ void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce
  sub_iter.for_each(loop, grain_size);
}

- }} // namespace at::native::<anonymous>
+ }}} // namespace at::native::<anonymous>
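Note (not part of the diff): the extra closing brace matches the namespace change at the top of the file. A C++17 nested namespace definition opens two scopes with a single brace, while the expanded spelling needs one brace per namespace keyword, so the closer grows from }} to }}}. A minimal sketch:

// C++17: one '{' opens both 'at' and 'native', plus one for the inline namespace.
namespace at::native { inline namespace CPU_CAPABILITY {
constexpr int kSeventeen = 17;
}} // namespace at::native::CPU_CAPABILITY

// Expanded form used after this change: three '{', so three '}' to close.
namespace at { namespace native { inline namespace CPU_CAPABILITY {
constexpr int kEleven = 11;
}}} // namespace at::native::CPU_CAPABILITY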