Improve performance of GCXS dot ndarray (#643)

jcapriot · web-flow · commit cb6b604698f4 · 2024-02-20T09:15:26.000Z
* improve memory access order for csr_ndarray_dot operation

* improve access order for csc_ndarray_dot

* re-use _dot_csr_ndarray on the transposed operands for ndarray @ csc, replacing _dot_ndarray_csc

* make it easier for numba to optimize the csc_ndarray function.

* add benchmarking

* ensure contiguous memory (np.ascontiguousarray on the dense operand)
diff --git a/benchmarks/benchmark_gcxs.py b/benchmarks/benchmark_gcxs.py
@@ -67,3 +67,28 @@ def time_index_slice3(self):
 
     def time_index_fancy(self):
         self.x[self.index]
+
+
+class DenseMultiplySuite:  # benchmarks for GCXS @ ndarray and ndarray @ GCXS products
+    params = ([0, 1], [1, 20, 100])  # one list of values per name in param_names
+    param_names = ["compressed axis", "n_vectors"]
+
+    def setup(self, compressed_axis, n_vecs):
+        rng = np.random.default_rng(1337)  # fixed seed, so the generated data is reproducible
+        n = 10000
+        x = sparse.random((n, n), density=0.001, format="gcxs", random_state=rng).change_compressed_axes(
+            (compressed_axis,)
+        )
+        self.x = x
+        self.t = rng.random((n, n_vecs))  # dense right-hand operand, used as x @ t
+        self.u = rng.random((n_vecs, n))  # dense left-hand operand, used as u @ x
+
+        # Warm-up: run both products once to trigger Numba compilation (presumably to keep it out of the timings)
+        self.x @ self.t
+        self.u @ self.x
+
+    def time_gcxs_dot_ndarray(self, *args):  # sparse @ dense; *args is accepted but unused
+        self.x @ self.t
+
+    def time_ndarray_dot_gcxs(self, *args):  # dense @ sparse; *args is accepted but unused
+        self.u @ self.x
diff --git a/sparse/_common.py b/sparse/_common.py
@@ -414,7 +414,8 @@ def _dot(a, b, return_type=None):
 
         # compressed_axes == (1,)
         if return_type is None or return_type == np.ndarray:
-            return _dot_ndarray_csc_type(a.dtype, b.dtype)(out_shape, b.data, b.indices, b.indptr, a)
+            out = _dot_csr_ndarray_type(bt.dtype, at.dtype)(out_shape[::-1], bt.data, bt.indices, bt.indptr, at)
+            return out.T
         data, indices, indptr = _dot_csr_ndarray_type_sparse(bt.dtype, at.dtype)(
             out_shape[::-1], bt.data, bt.indices, bt.indptr, at
         )
@@ -717,15 +718,15 @@ def _dot_csr_ndarray(out_shape, a_data, a_indices, a_indptr, b):  # pragma: no c
         out_shape : Tuple[int]
             The shape of the output array.
         """
-        out = np.empty(out_shape, dtype=dtr)
+        b = np.ascontiguousarray(b)  # ensure memory aligned
+        out = np.zeros(out_shape, dtype=dtr)
         for i in range(out_shape[0]):
-            for j in range(out_shape[1]):
-                val = 0
-                for k in range(a_indptr[i], a_indptr[i + 1]):
-                    ind = a_indices[k]
-                    v = a_data[k]
-                    val += v * b[ind, j]
-                out[i, j] = val
+            val = out[i]
+            for k in range(a_indptr[i], a_indptr[i + 1]):
+                ind = a_indices[k]
+                v = a_data[k]
+                for j in range(out_shape[1]):
+                    val[j] += v * b[ind, j]
         return out
 
     return _dot_csr_ndarray
@@ -866,51 +867,20 @@ def _dot_csc_ndarray(a_shape, b_shape, a_data, a_indices, a_indptr, b):  # pragm
         a_shape, b_shape : Tuple[int]
             The shapes of the input arrays.
         """
+        b = np.ascontiguousarray(b)  # ensure memory aligned
         out = np.zeros((a_shape[0], b_shape[1]), dtype=dtr)
-        for j in range(b_shape[1]):
-            for i in range(b_shape[0]):
-                for k in range(a_indptr[i], a_indptr[i + 1]):
-                    out[a_indices[k], j] += a_data[k] * b[i, j]
+        for i in range(b_shape[0]):
+            for k in range(a_indptr[i], a_indptr[i + 1]):
+                ind = a_indices[k]
+                v = a_data[k]
+                val = out[ind]
+                for j in range(b_shape[1]):
+                    val[j] += v * b[i, j]
         return out
 
     return _dot_csc_ndarray
 
 
-@_memoize_dtype
-def _dot_ndarray_csc_type(dt1, dt2):
-    dtr = _dot_dtype(dt1, dt2)
-
-    @numba.jit(
-        nopython=True,
-        nogil=True,
-        locals={"data_curr": numba.np.numpy_support.from_dtype(dtr)},
-    )
-    def _dot_ndarray_csc(out_shape, b_data, b_indices, b_indptr, a):  # pragma: no cover
-        """
-        Utility function taking in one `ndarray` and one ``GCXS`` and
-        calculating their dot product: a @ b for b with compressed columns.
-
-        Parameters
-        ----------
-        a : np.ndarray
-            The input array ``a``.
-        b_data, b_indices, b_indptr : np.ndarray
-            The data, indices, and index pointers of ``b``.
-        out_shape : Tuple[int]
-            The shape of the output array.
-        """
-        out = np.empty(out_shape, dtype=dtr)
-        for i in range(out_shape[0]):
-            for j in range(out_shape[1]):
-                total = 0
-                for k in range(b_indptr[j], b_indptr[j + 1]):
-                    total += a[i, b_indices[k]] * b_data[k]
-                out[i, j] = total
-        return out
-
-    return _dot_ndarray_csc
-
-
 @_memoize_dtype
 def _dot_coo_coo_type(dt1, dt2):
     dtr = _dot_dtype(dt1, dt2)