
Commit c26a1d5

Merge pull request #11 from JetBrains-Research/cuda-managed
Cuda Managed Mem (minor fixes)
2 parents a1f32b7 + 164b19e commit c26a1d5

10 files changed, +69 -20 lines

Diff for: CHANGELOG.md (-1)

@@ -22,7 +22,6 @@ Added new vector C API, exposed vector primitive into python-package.
 - Vector creation (empty, from data, with random data)
 - Matrix-vector operations (matrix-vector and vector-matrix multiplication)
 - Vector-vector operations (element-wise addition)
-- Matrix operations (equality, reduce to value, extract sub-vector)
 - Vector data extraction (as list of indices)
 - Vector syntax sugar (pretty string printing, slicing, iterating through non-zero indices)
 - Matrix operations (extract row or matrix column as sparse vector, reduce matrix (optionally transposed) to vector)

Diff for: README.md (+27 -4)

@@ -51,8 +51,8 @@ prototyping algorithms on a local computer for later running on a powerful serve
 ### Platforms
 
 - Linux based OS (tested on Ubuntu 20.04)
-- Windows (not tested yet)
-- macOS (not tested yet)
+- Windows (coming soon)
+- macOS (coming soon)
 
 ### Simple example
 
@@ -74,9 +74,32 @@ b[2, 1] = True
 print(a, b, a.mxm(b), sep="\n")
 ```
 
+### Performance
+
+Sparse Boolean matrix-matrix multiplication evaluation results are listed below.
+Machine configuration: PC with Ubuntu 20.04, Intel Core i7-6700 3.40GHz CPU, DDR4 64Gb RAM, GeForce GTX 1070 GPU with 8Gb VRAM.
+
+![time](https://github.com/JetBrains-Research/cuBool/raw/master/docs/pictures/mxm-perf-time.svg?raw=true&sanitize=true)
+![mem](https://github.com/JetBrains-Research/cuBool/raw/master/docs/pictures/mxm-perf-mem.svg?raw=true&sanitize=true)
+
+The matrix data is selected from the SuiteSparse Matrix Collection [link](https://sparse.tamu.edu).
+
+| Matrix name              |    # Rows |     Nnz M | Nnz/row | Max Nnz/row |    Nnz M^2 |
+|---                       |      ---: |      ---: |    ---: |        ---: |       ---: |
+| SNAP/amazon0312          |   400,727 | 3,200,440 |     7.9 |          10 | 14,390,544 |
+| LAW/amazon-2008          |   735,323 | 5,158,388 |     7.0 |          10 | 25,366,745 |
+| SNAP/web-Google          |   916,428 | 5,105,039 |     5.5 |         456 | 29,710,164 |
+| SNAP/roadNet-PA          | 1,090,920 | 3,083,796 |     2.8 |           9 |  7,238,920 |
+| SNAP/roadNet-TX          | 1,393,383 | 3,843,320 |     2.7 |          12 |  8,903,897 |
+| SNAP/roadNet-CA          | 1,971,281 | 5,533,214 |     2.8 |          12 | 12,908,450 |
+| DIMACS10/netherlands_osm | 2,216,688 | 4,882,476 |     2.2 |           7 |  8,755,758 |
+
+Detailed comparison is available in the full paper text at
+[link](https://github.com/YaccConstructor/articles/blob/master/2021/GRAPL/Sparse_Boolean_Algebra_on_GPGPU/Sparse_Boolean_Algebra_on_GPGPU.pdf).
+
 ### Installation
 
-If you are running **Linux based** OS (tested on Ubuntu 20.04) you can download the official
+If you are running **Linux-based** OS (tested on Ubuntu 20.04) you can download the official
 PyPI **pycubool** python package, which includes compiled library source code
 with Cuda and Sequential computations support. Installation process
 requires only `python3` to be installed on your machine. Python can be installed
@@ -102,7 +125,7 @@ These steps are required if you want to build library for your specific platform
 
 ### Requirements
 
-- Linux based OS (tested on Ubuntu 20.04)
+- Linux-based OS (tested on Ubuntu 20.04)
 - CMake Version 3.15 or higher
 - CUDA Compatible GPU device (to run Cuda computations)
 - GCC Compiler
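A note on reading the benchmark table added above: # Rows is the number of matrix rows, Nnz M is the number of non-zero (true) entries in the input matrix, Nnz/row is the average number of non-zeros per row, Max Nnz/row is the densest row, and Nnz M^2 is presumably the non-zero count of the Boolean product of the matrix with itself. The average column follows directly from the first two, e.g. for SNAP/roadNet-PA: Nnz/row = Nnz M / # Rows = 3,083,796 / 1,090,920 ≈ 2.8.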

Diff for: cubool/include/cubool/cubool.h (+1)

@@ -118,6 +118,7 @@ typedef struct cuBool_Vector_t* cuBool_Vector;
 typedef struct cuBool_DeviceCaps {
     char name[256];
     bool cudaSupported;
+    bool managedMem;
     int major;
     int minor;
     int warp;
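For context, a minimal sketch of how a client could consume the new `managedMem` flag through the C API. The struct fields are taken from the header above; the entry-point names (`cuBool_Initialize`, `cuBool_GetDeviceCaps`, `cuBool_Finalize`) and the hint constant are assumptions and should be checked against the actual declarations in `cubool.h`.

```cpp
#include <cubool/cubool.h>
#include <cstdio>

int main() {
    // Entry-point names and the hint value below are assumed, not taken from this diff.
    cuBool_Initialize(CUBOOL_HINT_NO);

    cuBool_DeviceCaps caps;
    cuBool_GetDeviceCaps(&caps);   // hypothetical capabilities query

    if (caps.cudaSupported) {
        // managedMem is the field added by this commit.
        std::printf("device: %s (sm_%d%d), warp %d, %s memory\n",
                    caps.name, caps.major, caps.minor, caps.warp,
                    caps.managedMem ? "managed" : "default");
    }

    cuBool_Finalize();
    return 0;
}
```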

Diff for: cubool/sources/core/library.cpp (+2)

@@ -249,6 +249,7 @@ namespace cubool {
     void Library::queryCapabilities(cuBool_DeviceCaps &caps) {
         caps.name[0] = '\0';
         caps.cudaSupported = false;
+        caps.managedMem = false;
         caps.major = 0;
         caps.minor = 0;
         caps.warp = 0;
@@ -272,6 +273,7 @@ namespace cubool {
             << " name: " << caps.name << ","
             << " major: " << caps.major << ","
             << " minor: " << caps.minor << ","
+            << " mem type: " << (caps.managedMem? "managed": "default") << ","
             << " warp size: " << caps.warp << ","
             << " globalMemoryKiBs: " << caps.globalMemoryKiBs << ","
             << " sharedMemoryPerMultiProcKiBs: " << caps.sharedMemoryPerMultiProcKiBs << ","

Diff for: cubool/sources/cuda/cuda_backend.cu (+1 -1)

@@ -92,7 +92,7 @@ namespace cubool {
     }
 
     void CudaBackend::queryCapabilities(cuBool_DeviceCaps &caps) {
-        CudaInstance::queryDeviceCapabilities(caps);
+        mInstance->queryDeviceCapabilities(caps);
    }
 
     CudaInstance & CudaBackend::getInstance() {
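This one-line change follows from the files below: `queryDeviceCapabilities` now reports whether managed memory is in use, which is per-instance state (`mMemoryType`), so it can no longer be a static member of `CudaInstance` and is called through `mInstance` instead.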

Diff for: cubool/sources/cuda/cuda_instance.cu (+12 -11)

@@ -86,17 +86,7 @@ namespace cubool {
         }
     }
 
-    CudaInstance::MemType CudaInstance::getMemoryType() const {
-        return mMemoryType;
-    }
-
-    bool CudaInstance::isCudaDeviceSupported() {
-        int device;
-        cudaError error = cudaGetDevice(&device);
-        return error == cudaSuccess;
-    }
-
-    void CudaInstance::queryDeviceCapabilities(cuBool_DeviceCaps &deviceCaps) {
+    void CudaInstance::queryDeviceCapabilities(cuBool_DeviceCaps &deviceCaps) const {
         const unsigned long long KiB = 1024;
 
         int device;
@@ -109,6 +99,7 @@ namespace cubool {
         if (error == cudaSuccess) {
             strcpy(deviceCaps.name, deviceProp.name);
             deviceCaps.cudaSupported = true;
+            deviceCaps.managedMem = mMemoryType == MemType::Managed;
             deviceCaps.minor = deviceProp.minor;
             deviceCaps.major = deviceProp.major;
             deviceCaps.warp = deviceProp.warpSize;
@@ -119,6 +110,16 @@ namespace cubool {
         }
     }
 
+    CudaInstance::MemType CudaInstance::getMemoryType() const {
+        return mMemoryType;
+    }
+
+    bool CudaInstance::isCudaDeviceSupported() {
+        int device;
+        cudaError error = cudaGetDevice(&device);
+        return error == cudaSuccess;
+    }
+
     void CudaInstance::allocate(void* &ptr, size_t size) const {
         ptr = malloc(size);
         CHECK_RAISE_ERROR(ptr != nullptr, MemOpFailed, "Failed to allocate memory on the CPU");
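The new `deviceCaps.managedMem` value simply mirrors the instance's memory mode. As a rough, hedged illustration of what `MemType::Managed` usually implies for a GPU allocation helper such as `allocateOnGpu` (the real method body is not part of this diff, and the name of the non-managed enum value is assumed):

```cpp
// Hedged sketch, NOT the actual cuBool implementation: how a GPU allocator
// typically switches between managed (unified) and default device memory.
#include <cuda_runtime.h>
#include <cstddef>

enum class MemType { Default, Managed };   // mirrors the enum used above; value names assumed

void allocateOnGpu(void*& ptr, std::size_t size, MemType memoryType) {
    cudaError_t error;
    if (memoryType == MemType::Managed) {
        // Unified (managed) memory: accessible from host and device,
        // migrated on demand by the CUDA runtime.
        error = cudaMallocManaged(&ptr, size);
    } else {
        // Default: device-only memory, explicit host<->device copies required.
        error = cudaMalloc(&ptr, size);
    }
    if (error != cudaSuccess) {
        ptr = nullptr;  // the library raises MemOpFailed in this situation; simplified here
    }
}
```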

Diff for: cubool/sources/cuda/cuda_instance.hpp (+1 -1)

@@ -51,12 +51,12 @@ namespace cubool {
         void allocateOnGpu(void* &ptr, size_t s) const;
         void deallocate(void* ptr) const;
         void deallocateOnGpu(void* ptr) const;
+        void queryDeviceCapabilities(cuBool_DeviceCaps& deviceCaps) const;
 
         void syncHostDevice() const;
         MemType getMemoryType() const;
 
         static bool isCudaDeviceSupported();
-        static void queryDeviceCapabilities(cuBool_DeviceCaps& deviceCaps);
         static CudaInstance& getInstanceRef();
         static CudaInstance* getInstancePtr();
         static bool isInstancePresent();

Diff for: docs/pictures/mxm-perf-mem.svg (+1)

Diff for: docs/pictures/mxm-perf-time.svg (+1)

Diff for: python/README.md (+23 -2)

@@ -31,8 +31,6 @@ prototyping algorithms on a local computer for later running on a powerful serve
 
 ### Features
 
-- C API for performance-critical computations
-- Python package for every-day tasks
 - Cuda backend for computations
 - Cpu backend for computations
 - Matrix/vector creation (empty, from data, with random data)
@@ -47,6 +45,29 @@ prototyping algorithms on a local computer for later running on a powerful serve
 - GraphViz (export single matrix or set of matrices as a graph with custom color and label settings)
 - Debug (matrix string debug markers, logging)
 
+### Performance
+
+Sparse Boolean matrix-matrix multiplication evaluation results are listed below.
+Machine configuration: PC with Ubuntu 20.04, Intel Core i7-6700 3.40GHz CPU, DDR4 64Gb RAM, GeForce GTX 1070 GPU with 8Gb VRAM.
+
+![time](https://github.com/JetBrains-Research/cuBool/raw/master/docs/pictures/mxm-perf-time.svg?raw=true&sanitize=true)
+![mem](https://github.com/JetBrains-Research/cuBool/raw/master/docs/pictures/mxm-perf-mem.svg?raw=true&sanitize=true)
+
+The matrix data is selected from the SuiteSparse Matrix Collection [link](https://sparse.tamu.edu).
+
+| Matrix name              |    # Rows |     Nnz M | Nnz/row | Max Nnz/row |    Nnz M^2 |
+|---                       |      ---: |      ---: |    ---: |        ---: |       ---: |
+| SNAP/amazon0312          |   400,727 | 3,200,440 |     7.9 |          10 | 14,390,544 |
+| LAW/amazon-2008          |   735,323 | 5,158,388 |     7.0 |          10 | 25,366,745 |
+| SNAP/web-Google          |   916,428 | 5,105,039 |     5.5 |         456 | 29,710,164 |
+| SNAP/roadNet-PA          | 1,090,920 | 3,083,796 |     2.8 |           9 |  7,238,920 |
+| SNAP/roadNet-TX          | 1,393,383 | 3,843,320 |     2.7 |          12 |  8,903,897 |
+| SNAP/roadNet-CA          | 1,971,281 | 5,533,214 |     2.8 |          12 | 12,908,450 |
+| DIMACS10/netherlands_osm | 2,216,688 | 4,882,476 |     2.2 |           7 |  8,755,758 |
+
+Detailed comparison is available in the full paper text at
+[link](https://github.com/YaccConstructor/articles/blob/master/2021/GRAPL/Sparse_Boolean_Algebra_on_GPGPU/Sparse_Boolean_Algebra_on_GPGPU.pdf).
+
 ### Simple example
 
 Create sparse matrices, compute matrix-matrix product and print the result to the output:
