h2oai
diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh
Lines changed: 1 addition & 2 deletions
diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/kmeans/KmMatrix/blas.cuh
Lines changed: 102 additions & 15 deletions
diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh
Lines changed: 12 additions & 6 deletions
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+// Singleton class storing GPU info.
 class GpuInfo {
  private:
   int n_gpu_;
@@ -67,6 +68,4 @@ class GpuInfo {
 
 };
 
-// const GpuInfoImpl GpuInfo::impl = GpuInfoImpl();
-
 #endif  // GPU_INFO_HPP_
@@ -250,7 +250,7 @@ KmMatrix<T> KmMatrix<T>::stack(KmMatrix<T> &_second,
 
 template <typename T>
 std::ostream& operator<<(std::ostream& os, KmMatrix<T>& m) {
-  std::cout << "matrix: " << m.name() << std::endl << "---" << std::endl;
+  std::cout << "\nmatrix: " << m.name() << std::endl << "---" << std::endl;
   T * ptr = m.host_ptr();
   kParam<T> param = m.k_param();
   for (size_t i = 0; i < param.rows; ++i) {
 
@@ -17,9 +17,9 @@ namespace KMeans {
 namespace Blas {
 // LEVEL 1
 inline void axpy(cublasHandle_t handle, int n,
-                           const float *alpha,
-                           const float *x, int incx,
-                           float *y, int incy) {
+                 const float *alpha,
+                 const float *x, int incx,
+                 float *y, int incy) {
   CUBLAS_CHECK(cublasSaxpy(handle, n,
                            alpha,
                            x, incx,
@@ -50,19 +50,13 @@ inline void gemm(cublasHandle_t handle,
                  float *C,
                  int ldc) {
   CUBLAS_CHECK(cublasSgemm(handle,
-                           transa,
-                           transb,
-                           m,
-                           n,
-                           k,
+                           transa, transb,
+                           m, n, k,
                            alpha, /* host or device pointer */
-                           A,
-                           lda,
-                           B,
-                           ldb,
+                           A, lda,
+                           B, ldb,
                            beta, /* host or device pointer */
-                           C,
-                           ldc));}
+                           C, ldc));}
 
 inline void gemm(cublasHandle_t handle,
                  cublasOperation_t transa,
@@ -93,8 +87,101 @@ inline void gemm(cublasHandle_t handle,
                            C,
                            ldc));}
 
-}  // Blas
+// Batched double-precision GEMM wrapper.
+// Forwards every argument unchanged to cublasDgemmBatched and hands the
+// returned status to CUBLAS_CHECK. Aarray, Barray and Carray are arrays of
+// per-matrix pointers; see the cuBLAS documentation of cublasDgemmBatched
+// for the meaning of each argument.
+inline void gemm_batched(cublasHandle_t handle,
+                         cublasOperation_t transa, cublasOperation_t transb,
+                         int m, int n, int k,
+                         const double *alpha,
+                         const double *Aarray[], int lda,
+                         const double *Barray[], int ldb,
+                         const double *beta,
+                         double *Carray[], int ldc,
+                         int batchCount) {
+  CUBLAS_CHECK(cublasDgemmBatched(handle,
+                                  transa, transb,
+                                  m, n, k,
+                                  alpha,
+                                  Aarray, lda, Barray, ldb,
+                                  beta,
+                                  Carray, ldc,
+                                  batchCount));
+}
 
+// Batched single-precision GEMM wrapper.
+// Forwards every argument unchanged to cublasSgemmBatched and hands the
+// returned status to CUBLAS_CHECK. Aarray, Barray and Carray are arrays of
+// per-matrix pointers; see the cuBLAS documentation of cublasSgemmBatched
+// for the meaning of each argument.
+inline void gemm_batched(cublasHandle_t handle,
+                         cublasOperation_t transa, cublasOperation_t transb,
+                         int m, int n, int k,
+                         const float *alpha,
+                         const float *Aarray[], int lda,
+                         const float *Barray[], int ldb,
+                         const float *beta,
+                         float *Carray[], int ldc,
+                         int batchCount) {
+  CUBLAS_CHECK(cublasSgemmBatched(handle,
+                                  transa, transb,
+                                  m, n, k,
+                                  alpha,
+                                  Aarray, lda, Barray, ldb,
+                                  beta,
+                                  Carray, ldc,
+                                  batchCount));
+}
+
+// Strided-batched double-precision GEMM wrapper.
+// Forwards to cublasDgemmStridedBatched and hands the returned status to
+// CUBLAS_CHECK. A, B and C each point at the first matrix of their batch;
+// strideA/strideB/strideC are passed straight through as the cuBLAS stride
+// arguments.
+//
+// The strides are declared `long long int`, the type
+// cublasDgemmStridedBatched itself takes, so a stride that does not fit in
+// a 32-bit int is no longer truncated at this wrapper's boundary. Callers
+// that pass int keep compiling through the implicit widening conversion.
+inline void gemm_strided_batched(
+    cublasHandle_t handle,
+    cublasOperation_t transA, cublasOperation_t transB,
+    int M, int N, int K,
+    const double* alpha,
+    const double* A, int ldA, long long int strideA,
+    const double* B, int ldB, long long int strideB,
+    const double* beta,
+    double* C, int ldC, long long int strideC,
+    int batchCount) {
+  CUBLAS_CHECK(cublasDgemmStridedBatched(handle,
+                                         transA, transB,
+                                         M, N, K,
+                                         alpha,
+                                         A, ldA, strideA,
+                                         B, ldB, strideB,
+                                         beta,
+                                         C, ldC, strideC,
+                                         batchCount));
+}
+
+// Strided-batched single-precision GEMM wrapper.
+// Forwards to cublasSgemmStridedBatched and hands the returned status to
+// CUBLAS_CHECK. A, B and C each point at the first matrix of their batch;
+// strideA/strideB/strideC are passed straight through as the cuBLAS stride
+// arguments.
+//
+// The strides are declared `long long int`, the type
+// cublasSgemmStridedBatched itself takes, so a stride that does not fit in
+// a 32-bit int is no longer truncated at this wrapper's boundary. Callers
+// that pass int keep compiling through the implicit widening conversion.
+inline void gemm_strided_batched(
+    cublasHandle_t handle,
+    cublasOperation_t transA, cublasOperation_t transB,
+    int M, int N, int K,
+    const float* alpha,
+    const float* A, int ldA, long long int strideA,
+    const float* B, int ldB, long long int strideB,
+    const float* beta,
+    float* C, int ldC, long long int strideC,
+    int batchCount) {
+  CUBLAS_CHECK(cublasSgemmStridedBatched(handle,
+                                         transA, transB,
+                                         M, N, K,
+                                         alpha,
+                                         A, ldA, strideA,
+                                         B, ldB, strideB,
+                                         beta,
+                                         C, ldC, strideC,
+                                         batchCount));
+}
+
+}  // Blas
 }  // KMeans
 }  // H2O4GPU
 
 
@@ -15,15 +15,21 @@ M_DEVINLINE size_t global_thread_idx () {
   return threadIdx.x + blockIdx.x * blockDim.x;
 }
 
-M_DEVINLINE size_t grid_stride () {
+// Index of the calling thread along y, counted across the whole grid:
+// the block's y offset (blockIdx.y * blockDim.y) plus threadIdx.y.
+M_DEVINLINE size_t global_thread_idy () {
+  return blockIdx.y * blockDim.y + threadIdx.y;
+}
+
+// Total number of threads along x in the launched grid
+// (threads per block times blocks per grid).
+M_DEVINLINE size_t grid_stride_x () {
-  return blockDim.x * gridDim.x;
+  // Widen before multiplying: blockDim.x and gridDim.x are 32-bit unsigned,
+  // so their product would otherwise wrap before the conversion to size_t.
+  return static_cast<size_t>(blockDim.x) * gridDim.x;
 }
 
-// This wrapper function is created to work around a possible bug in nvcc,
-// which threats GpuInfo::ins() as calling base class method when used inside a
-// class member function.
-size_t get_blocks(size_t _mul, int _device=0) {
-  return GpuInfo::ins().blocks(_mul, _device);
+// Total number of threads along y in the launched grid
+// (blocks per grid times threads per block).
+M_DEVINLINE size_t grid_stride_y () {
+  return gridDim.y * blockDim.y;
+}
+
+// Returns a / b rounded up, as a T1.
+// a is converted to double, divided by b, passed through ceil() and the
+// result is cast back to T1.
+// NOTE(review): because a goes through double, integer values that a
+// double cannot represent exactly are rounded before the division.
+template <typename T1, typename T2>
+M_HOSTDEVINLINE T1 div_roundup(const T1 a, const T2 b) {
+  return static_cast<T1>(
+      ceil(static_cast<double>(a) / b));
 }
 
 }  // KMeans