 namespace H2O4GPU {
 namespace KMeans {
 
-namespace kernel {
-
-/*
- * Compute min value for each row.
- * @tparam T Numeric type of the data
- * @param _res The output matrix with shape m x 1
- * @param _val The input matrix with shape m x n
- */
-template <typename T>
-__global__ void row_min_sequential(kParam<T> _res, kParam<T> _val) {
-
-  size_t idx = global_thread_idx();
-  if (idx < _val.rows) {
-    T min = std::numeric_limits<T>::max();
-    for (size_t i = 0; i < _val.cols; ++i) {
-      T value = _val.ptr[idx * _val.cols + i];
-      if (value < min) {
-        min = value;
-      }
-    }
-    _res.ptr[idx] = min;
-  }
-}
-
-template <typename T>
-__global__ void row_argmin_sequential(kParam<int> _res, kParam<T> _val) {
-
-  size_t idx = global_thread_idx();
-  if (idx < _val.rows) {
-    T min = std::numeric_limits<T>::max();
-    int min_idx = -1;
-    for (size_t i = 0; i < _val.cols; ++i) {
-      T value = _val.ptr[idx * _val.cols + i];
-      if (value < min) {
-        min = value;
-        min_idx = i;
-      }
-    }
-    _res.ptr[idx] = min_idx;
-  }
-}
-
-} // namespace kernel
-
 // FIXME: Using struct for operations is just keeping the possibility of
 // creating a unified operations for KmMatrix. For example, let KmMatrix
 // inherit those left associative ops, or create an interface for elementwise
@@ -60,110 +16,40 @@ __global__ void row_argmin_sequential(kParam<int> _res, kParam<T> _val) {
 // FIXME: Use return value instead.
 template <typename T>
 struct DotOp {
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val) {
-    this->dot(_res, _val, _val);
-  }
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs,
-           KmMatrix<T>& _rhs) {
-    constexpr T alpha = 1.0;
-    constexpr T beta = 1.0;
-    cublasHandle_t handle = GpuInfo::ins().cublas_handle();
-    Blas::gemm(handle,
-               CUBLAS_OP_N, CUBLAS_OP_N,  // FIXME
-               _lhs.rows(), _rhs.cols(), _lhs.cols(),
-               &alpha,
-               _lhs.dev_ptr(), _lhs.cols(),
-               _rhs.dev_ptr(), _rhs.cols(),
-               &beta,
-               _res.dev_ptr(), _res.cols());
-  }
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val);
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs);
 };
 
 template <typename T>
 struct VecBatchDotOp {
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val) {
-    this->dot(_res, _val, _val);
-  }
-  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs) {
-    constexpr T alpha = 1.0;
-    constexpr T beta = 1.0;
-    cublasHandle_t handle = GpuInfo::ins().cublas_handle();
-    Blas::gemm_strided_batched(
-        handle,
-        CUBLAS_OP_N, CUBLAS_OP_T,
-        1, 1, _rhs.cols(),  // m, n, k
-        &alpha,
-        _lhs.dev_ptr(), 1, _lhs.cols(),
-        _rhs.dev_ptr(), 1, _rhs.cols(),
-        &beta,
-        _res.dev_ptr(), _res.cols(), 1,  // c should be column vector
-        _lhs.rows());
-  }
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _val);
+  void dot(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs);
 };
 
 template <typename T>
 struct SumOp {
-  T sum(KmMatrix<T>& _val) {
-    T* raw_ptr = _val.dev_ptr();
-    thrust::device_ptr<T> ptr(raw_ptr);
-    T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus<T>());
-    return res;
-  }
+  T sum(KmMatrix<T>& _val);
 };
 
 template <typename T>
 struct MulOp {
-  void mul(KmMatrix<T>& _res, KmMatrix<T>& _lhs, T _rhs) {
-    cublasHandle_t handle = GpuInfo::ins().cublas_handle();
-    Blas::axpy(
-        handle, _lhs.size(),  // handle, n
-        &_rhs,                // alpha
-        _lhs.dev_ptr(), 1,
-        _res.dev_ptr(), 1);
-  }
+  void mul(KmMatrix<T>& _res, KmMatrix<T>& _lhs, T _rhs);
 };
 
 
 template <typename T>
 struct MeanOp {
-  T mean(KmMatrix<T>& _val) {
-    T res = SumOp<T>().sum(_val);
-    res = res / _val.size();
-    return res;
-  }
+  T mean(KmMatrix<T>& _val);
 };
 
 template <typename T>
 struct ArgMinOp {
-
-  KmMatrix<int> argmin(KmMatrix<T>& _val, KmMatrixDim _dim) {
-    if (_dim == KmMatrixDim::ROW) {
-      KmMatrix<int> _res(_val.rows(), 1);
-      kernel::row_argmin_sequential<<<div_roundup(_val.rows(), 256), 256>>>(
-          _res.k_param(), _val.k_param());
-      return _res;
-    } else {
-      // FIXME
-      M_ERROR("Not implemented");
-    }
-  }
+  KmMatrix<int> argmin(KmMatrix<T>& _val, KmMatrixDim _dim);
 };
 
 template <typename T>
 struct MinOp {
-
-  KmMatrix<T> min(KmMatrix<T>& _val, KmMatrixDim _dim) {
-    size_t blocks = GpuInfo::ins().blocks(32);
-    if (_dim == KmMatrixDim::ROW) {
-      KmMatrix<T> _res(_val.rows(), 1);
-      kernel::row_min_sequential<<<div_roundup(_val.rows(), 256), 256>>>(
-          _res.k_param(), _val.k_param());
-      return _res;
-    } else {
-      // FIXME
-      M_ERROR("Not implemented");
-    }
-  }
+  KmMatrix<T> min(KmMatrix<T>& _val, KmMatrixDim _dim);
 };
 
 } // namespace KMeans
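
The template bodies deleted above presumably move into a separate CUDA translation unit, leaving this header with declarations only. A minimal sketch of that pattern for two of the ops, assuming a hypothetical KmMatrixOps.cu; the real file name, header name, exact definitions, and set of instantiated element types may differ from what the commit actually does:

// Hypothetical KmMatrixOps.cu: out-of-line definitions for ops declared in the
// header, plus explicit instantiations so translation units that only see the
// declarations still link.
#include <thrust/device_ptr.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

#include "KmMatrix.hpp"  // hypothetical: whichever header declares KmMatrix and the op structs

namespace H2O4GPU {
namespace KMeans {

template <typename T>
T SumOp<T>::sum(KmMatrix<T>& _val) {
  // Same logic as the removed inline body: reduce the flattened device buffer.
  thrust::device_ptr<T> ptr(_val.dev_ptr());
  return thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus<T>());
}

template <typename T>
T MeanOp<T>::mean(KmMatrix<T>& _val) {
  return SumOp<T>().sum(_val) / _val.size();
}

// Explicit instantiations for the element types the library is built with.
template struct SumOp<float>;
template struct SumOp<double>;
template struct MeanOp<float>;
template struct MeanOp<double>;

} // namespace KMeans
} // namespace H2O4GPU

Explicit instantiation is what keeps a declaration-only header usable from other translation units without pulling Thrust and cuBLAS into every consumer.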
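
On the CUBLAS_OP_N, CUBLAS_OP_N // FIXME line in the removed DotOp::dot: the kernels above index data as ptr[idx * cols + i], i.e. row-major, while cuBLAS assumes column-major storage, which is presumably what that FIXME is about. A standard way to get a row-major product out of an unmodified gemm is to swap the operands, because C = A*B stored row-major occupies the same buffer as C^T = B^T * A^T stored column-major. A sketch of that call shape, keeping the Blas::gemm argument order used in the removed body and assuming _lhs is m x k, _rhs is k x n, _res is m x n; it also uses beta = 0 to overwrite _res, where the removed code accumulated with beta = 1:

// Sketch only: row-major _res = _lhs * _rhs through column-major cuBLAS by
// computing the transposed product with swapped operands.
constexpr T alpha = 1.0;
constexpr T beta = 0.0;
cublasHandle_t handle = GpuInfo::ins().cublas_handle();
Blas::gemm(handle,
           CUBLAS_OP_N, CUBLAS_OP_N,
           _rhs.cols(), _lhs.rows(), _lhs.cols(),  // m, n, k of the transposed product
           &alpha,
           _rhs.dev_ptr(), _rhs.cols(),            // _rhs goes in the A slot, ld = n
           _lhs.dev_ptr(), _lhs.cols(),            // _lhs goes in the B slot, ld = k
           &beta,
           _res.dev_ptr(), _res.cols());           // _res, ld = n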
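
The FIXME at the top of this hunk sketches two directions for unifying these free-standing op structs: let KmMatrix inherit left-associative ops, or define a common interface for elementwise operations. Purely as an illustration of the second idea (ElementwiseOp and apply are invented names, not part of the code base), a declaration-level sketch might look like:

// Hypothetical common interface for elementwise binary ops on KmMatrix.
template <typename T>
class KmMatrix;  // forward declaration; the real class is defined elsewhere

template <typename T>
struct ElementwiseOp {
  virtual ~ElementwiseOp() {}
  // Write op(_lhs[i], _rhs[i]) into _res[i] for every element.
  virtual void apply(KmMatrix<T>& _res, KmMatrix<T>& _lhs, KmMatrix<T>& _rhs) = 0;
};

Concrete ops could then implement this interface, or KmMatrix itself could expose operator overloads dispatching to it, which would also give one natural place to address the "use return value instead" FIXME.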