diff --git a/pykokkos/__init__.py b/pykokkos/__init__.py
index 9340945f..04131ba6 100644
--- a/pykokkos/__init__.py
+++ b/pykokkos/__init__.py
@@ -72,6 +72,7 @@
     ceil,
     floor,
     broadcast_view,
+    cumsum,
 )
 from pykokkos.lib.info import iinfo, finfo
 from pykokkos.lib.create import zeros, zeros_like, ones, ones_like, full, full_like
diff --git a/pykokkos/lib/ufuncs.py b/pykokkos/lib/ufuncs.py
index fe741bd1..1c84b4bb 100644
--- a/pykokkos/lib/ufuncs.py
+++ b/pykokkos/lib/ufuncs.py
@@ -3164,6 +3164,333 @@ def argmax(view, axis=None):
     return view
 
 
+# NOTE: these workunits/kernels really make little sense
+# to a Python developer who hasn't read, e.g., the C++ parallel_scan
+# docs for Kokkos--could we not make this more Pythonic?
+# How does this behave for OpenMP vs. CUDA? The latter is quite
+# a complex parallel algorithm I think, and the amount of work
+# with multiple scans/passes under the hood is pretty hidden here
+
+
+@pk.workunit
+def cumsum_impl_1d_double(
+    tid: int,
+    acc: pk.Acc[pk.double],
+    last_pass: bool,
+    view: pk.View1D[pk.double],
+    new_view: pk.View1D[pk.double],
+):
+    acc += view[tid]
+    # store only on the final pass; the input view is left untouched
+    if last_pass:
+        new_view[tid] = acc
+
+
+@pk.workunit
+def cumsum_impl_1d_float(
+    tid: int,
+    acc: pk.Acc[pk.float],
+    last_pass: bool,
+    view: pk.View1D[pk.float],
+    new_view: pk.View1D[pk.float],
+):
+    acc += view[tid]
+    # store only on the final pass; the input view is left untouched
+    if last_pass:
+        new_view[tid] = acc
+
+
+@pk.workunit
+def cumsum_impl_1d_int32(
+    tid: int,
+    acc: pk.Acc[pk.int32],
+    last_pass: bool,
+    view: pk.View1D[pk.int32],
+    new_view: pk.View1D[pk.int32],
+):
+    acc += view[tid]
+    # store only on the final pass; the input view is left untouched
+    if last_pass:
+        new_view[tid] = acc
+
+
+@pk.workunit
+def cumsum_impl_1d_int64(
+    tid: int,
+    acc: pk.Acc[pk.int64],
+    last_pass: bool,
+    view: pk.View1D[pk.int64],
+    new_view: pk.View1D[pk.int64],
+):
+    acc += view[tid]
+    # store only on the final pass; the input view is left untouched
+    if last_pass:
+        new_view[tid] = acc
+
+
+@pk.workunit
+def cumsum_impl_2d_double(
+    tid: int,
+    acc: pk.Acc[pk.double],
+    last_pass: bool,
+    view: pk.View2D[pk.double],
+    new_view: pk.View2D[pk.double],
+):
+    # NOTE: by default, NumPy assigns the result
+    # to a new flattened array, but it is not clear
+    # to me how we'd do that here; while we can make
+    # new_view 1D, the iteration behavior is fairly opaque,
+    # and possibly not even guaranteed depending on the backend
+    # if I understood the feedback from ctrott?
+    for j in range(view.extent(1)):
+        acc += view[tid][j]
+        # store only on the final pass; the input view is left untouched
+        if last_pass:
+            new_view[tid][j] = acc
+
+
+@pk.workunit
+def cumsum_impl_2d_float(
+    tid: int,
+    acc: pk.Acc[pk.float],
+    last_pass: bool,
+    view: pk.View2D[pk.float],
+    new_view: pk.View2D[pk.float],
+):
+    for j in range(view.extent(1)):
+        acc += view[tid][j]
+        # store only on the final pass; the input view is left untouched
+        if last_pass:
+            new_view[tid][j] = acc
+
+
+@pk.workunit
+def cumsum_impl_2d_int32(
+    tid: int,
+    acc: pk.Acc[pk.int32],
+    last_pass: bool,
+    view: pk.View2D[pk.int32],
+    new_view: pk.View2D[pk.int32],
+):
+    for j in range(view.extent(1)):
+        acc += view[tid][j]
+        # store only on the final pass; the input view is left untouched
+        if last_pass:
+            new_view[tid][j] = acc
+
+
+@pk.workunit
+def cumsum_impl_2d_int64(
+    tid: int,
+    acc: pk.Acc[pk.int64],
+    last_pass: bool,
+    view: pk.View2D[pk.int64],
+    new_view: pk.View2D[pk.int64],
+):
+    for j in range(view.extent(1)):
+        acc += view[tid][j]
+        # store only on the final pass; the input view is left untouched
+        if last_pass:
+            new_view[tid][j] = acc
+
+
+@pk.workunit
+def cumsum_impl_3d_double(
+    tid: int,
+    acc: pk.Acc[pk.double],
+    last_pass: bool,
+    view: pk.View3D[pk.double],
+    new_view: pk.View3D[pk.double],
+):
+    for j in range(view.extent(1)):
+        for k in range(view.extent(2)):
+            acc += view[tid][j][k]
+            # store only on the final pass; the input view is left untouched
+            if last_pass:
+                new_view[tid][j][k] = acc
+
+
+@pk.workunit
+def cumsum_impl_3d_float(
+    tid: int,
+    acc: pk.Acc[pk.float],
+    last_pass: bool,
+    view: pk.View3D[pk.float],
+    new_view: pk.View3D[pk.float],
+):
+    for j in range(view.extent(1)):
+        for k in range(view.extent(2)):
+            acc += view[tid][j][k]
+            # store only on the final pass; the input view is left untouched
+            if last_pass:
+                new_view[tid][j][k] = acc
+
+
+@pk.workunit
+def cumsum_impl_3d_int32(
+    tid: int,
+    acc: pk.Acc[pk.int32],
+    last_pass: bool,
+    view: pk.View3D[pk.int32],
+    new_view: pk.View3D[pk.int32],
+):
+    for j in range(view.extent(1)):
+        for k in range(view.extent(2)):
+            acc += view[tid][j][k]
+            # store only on the final pass; the input view is left untouched
+            if last_pass:
+                new_view[tid][j][k] = acc
+
+
+@pk.workunit
+def cumsum_impl_3d_int64(
+    tid: int,
+    acc: pk.Acc[pk.int64],
+    last_pass: bool,
+    view: pk.View3D[pk.int64],
+    new_view: pk.View3D[pk.int64],
+):
+    for j in range(view.extent(1)):
+        for k in range(view.extent(2)):
+            acc += view[tid][j][k]
+            # store only on the final pass; the input view is left untouched
+            if last_pass:
+                new_view[tid][j][k] = acc
+
+
+def cumsum(view):
+    """
+    Return the cumulative sum of the elements of the flattened input.
+
+    Parameters
+    ----------
+    view : pykokkos view or NumPy array
+        1D, 2D or 3D input of dtype double, float, int32 or int64.
+
+    Returns
+    -------
+    y : pykokkos view or NumPy array
+        Flattened running sum, of the same array type as the input.
+
+    Raises
+    ------
+    NotImplementedError
+        If the dtype or the number of dimensions is not supported.
+
+    """
+    # TODO: support axis-aligned operation like the NumPy version
+    # TODO: support accumulator and output dtype specification like NumPy
+    # TODO: support an output array argument, as NumPy allows
+
+    # NOTE: parallel over the left-most dimension, but is this really
+    # guaranteed to produce optimal parallelism in all cases/for all backends?
+    if isinstance(view, (np.ndarray, np.generic)):
+        if np.issubdtype(view.dtype, np.float64):
+            view_loc = pk.View(view.shape, pk.double)
+        elif np.issubdtype(view.dtype, np.float32):
+            view_loc = pk.View(view.shape, pk.float)
+        elif np.issubdtype(view.dtype, np.int32):
+            view_loc = pk.View(view.shape, pk.int32)
+        elif np.issubdtype(view.dtype, np.int64):
+            view_loc = pk.View(view.shape, pk.int64)
+        else:
+            raise NotImplementedError(
+                f"cumsum not yet implemented for NumPy dtype {view.dtype}."
+            )
+        view_loc[:] = view
+        view = view_loc
+        arr_type = "numpy"
+    else:
+        # NOTE: these arr_type strings will probably need a better design
+        arr_type = "kokkos"
+    range_policy = pk.RangePolicy(pk.ExecutionSpace.Default, 0, view.shape[0])
+    dtype_name = view.dtype.__name__
+    if (dtype_name == "double" or dtype_name == "float64") and len(view.shape) == 1:
+        new_view = pk.View(view.shape, pk.double)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_1d_double, view=view, new_view=new_view
+        )
+    elif (dtype_name == "float" or dtype_name == "float32") and len(view.shape) == 1:
+        new_view = pk.View(view.shape, pk.float)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_1d_float, view=view, new_view=new_view
+        )
+    elif dtype_name == "int32" and len(view.shape) == 1:
+        new_view = pk.View(view.shape, pk.int32)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_1d_int32, view=view, new_view=new_view
+        )
+    elif dtype_name == "int64" and len(view.shape) == 1:
+        new_view = pk.View(view.shape, pk.int64)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_1d_int64, view=view, new_view=new_view
+        )
+    elif (dtype_name == "double" or dtype_name == "float64") and len(view.shape) == 2:
+        new_view = pk.View(view.shape, pk.double)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_2d_double, view=view, new_view=new_view
+        )
+    elif (dtype_name == "float" or dtype_name == "float32") and len(view.shape) == 2:
+        new_view = pk.View(view.shape, pk.float)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_2d_float, view=view, new_view=new_view
+        )
+    elif dtype_name == "int32" and len(view.shape) == 2:
+        new_view = pk.View(view.shape, pk.int32)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_2d_int32, view=view, new_view=new_view
+        )
+    elif dtype_name == "int64" and len(view.shape) == 2:
+        new_view = pk.View(view.shape, pk.int64)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_2d_int64, view=view, new_view=new_view
+        )
+    elif (dtype_name == "double" or dtype_name == "float64") and len(view.shape) == 3:
+        new_view = pk.View(view.shape, pk.double)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_3d_double, view=view, new_view=new_view
+        )
+    elif (dtype_name == "float" or dtype_name == "float32") and len(view.shape) == 3:
+        new_view = pk.View(view.shape, pk.float)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_3d_float, view=view, new_view=new_view
+        )
+    elif dtype_name == "int32" and len(view.shape) == 3:
+        new_view = pk.View(view.shape, pk.int32)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_3d_int32, view=view, new_view=new_view
+        )
+    elif dtype_name == "int64" and len(view.shape) == 3:
+        new_view = pk.View(view.shape, pk.int64)
+        pk.parallel_scan(
+            range_policy, cumsum_impl_3d_int64, view=view, new_view=new_view
+        )
+    else:
+        raise NotImplementedError(
+            f"cumsum not yet implemented for dtype {view.dtype} and shape {view.shape}. "
+            "Currently supported: double/float/int32/int64 for 1D/2D/3D views."
+        )
+    # NOTE: careful here--the default NumPy behavior is to calculate
+    # cumsum over the *flattened* array, so 2D/3D results are flattened
+    if len(view.shape) > 1:
+        new_view = np.reshape(new_view, view.size)
+    # try to return the same type you receive
+    if arr_type == "kokkos":
+        if dtype_name == "float" or dtype_name == "float32":
+            temp_view = pk.View([new_view.size], pk.float)
+        elif dtype_name == "double" or dtype_name == "float64":
+            temp_view = pk.View([new_view.size], pk.double)
+        elif dtype_name == "int32":
+            temp_view = pk.View([new_view.size], pk.int32)
+        elif dtype_name == "int64":
+            temp_view = pk.View([new_view.size], pk.int64)
+        temp_view[:] = new_view
+        new_view = temp_view
+    else:
+        new_view = np.asarray(new_view)
+    return new_view
+
+
 # TODO: Implement parallel sorting + filtering
 def unique(view):
     res = np.unique(view)
diff --git a/tests/test_ufuncs.py b/tests/test_ufuncs.py
index fd97c5e3..7df700ce 100644
--- a/tests/test_ufuncs.py
+++ b/tests/test_ufuncs.py
@@ -338,413 +338,43 @@ def test_caching():
 
 
 @pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [
-        (pk.reciprocal, np.reciprocal),
-    ],
-)
-@pytest.mark.parametrize(
-    "pk_dtype, numpy_dtype",
-    [
-        (pk.double, np.float64),
-        (pk.float, np.float32),
-    ],
-)
-def test_2d_exposed_ufuncs_vs_numpy(pk_ufunc, numpy_ufunc, pk_dtype, numpy_dtype):
-    rng = default_rng(123)
-    in_arr = rng.random((5, 5)).astype(numpy_dtype)
-    expected = numpy_ufunc(in_arr)
-
-    view: pk.View2d = pk.View([5, 5], pk_dtype)
-    view[:] = in_arr
-    actual = pk_ufunc(view=view)
-    assert_allclose(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
+    "arr",
     [
-        (pk.np_matmul, np.matmul),
+        np.arange(110),
+        np.ones((5, 3)) * 7.2,
+        np.ones((2, 3, 2)) * -3.19,
     ],
 )
 @pytest.mark.parametrize(
     "pk_dtype, numpy_dtype",
     [
-        (pk.double, np.float64),
-        (pk.float, np.float32),
-    ],
-)
-@pytest.mark.parametrize(
-    "test_dim", [[4, 4, 4, 4], [4, 3, 3, 4], [1, 1, 1, 1], [2, 5, 5, 1]]
-)
-def test_np_matmul_2d_2d_vs_numpy(
-    pk_ufunc, numpy_ufunc, pk_dtype, numpy_dtype, test_dim
-):
-
-    N1 = test_dim[0]
-    M1 = test_dim[1]
-    N2 = test_dim[2]
-    M2 = test_dim[3]
-    rng = default_rng(123)
-    np1 = rng.random((N1, M1)).astype(numpy_dtype)
-    np2 = rng.random((N2, M2)).astype(numpy_dtype)
-    expected = numpy_ufunc(np1, np2)
-
-    view1: pk.View2d = pk.View([N1, M1], pk_dtype)
-    view1[:] = np1
-    view2: pk.View2d = pk.View([N2, M2], pk_dtype)
-    view2[:] = np2
-    actual = pk_ufunc(view1, view2)
-
-    assert_allclose(actual, expected, rtol=1.5e-7)
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [
-        (pk.np_matmul, np.matmul),
-    ],
-)
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
-    ],
-)
-@pytest.mark.parametrize("test_dim", [[4, 4, 4], [4, 3, 3], [1, 1, 1], [2, 5, 5]])
-def test_np_matmul_2d_1d_vs_numpy(pk_ufunc, numpy_ufunc, numpy_dtype, test_dim):
-
-    N1 = test_dim[0]
-    M1 = test_dim[1]
-    N2 = test_dim[2]
-    rng = default_rng(123)
-    np1 = rng.random((N1, M1)).astype(numpy_dtype)
-    np2 = rng.random(N2).astype(numpy_dtype)
-    expected = numpy_ufunc(np1, np2)
-
-    view1 = pk.array(np1)
-    view2 = pk.array(np2)
-    actual = pk_ufunc(view1, view2)
-
-    assert_allclose(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [
-        (pk.np_matmul, np.matmul),
-    ],
-)
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
-    ],
-)
-@pytest.mark.parametrize("test_dim", [[4, 4, 4], [3, 3, 6], [1, 1, 1], [5, 5, 1]])
-def test_np_matmul_1d_2d_vs_numpy(pk_ufunc, numpy_ufunc, numpy_dtype, test_dim):
-
-    N1 = test_dim[0]
-    N2 = test_dim[1]
-    M2 = test_dim[2]
-    rng = default_rng(123)
-    np1 = rng.random(N1).astype(numpy_dtype)
-    np2 = rng.random((N2, M2)).astype(numpy_dtype)
-    expected = numpy_ufunc(np1, np2)
-
-    view1 = pk.array(np1)
-    view2 = pk.array(np2)
-    actual = pk_ufunc(view1, view2)
-
-    assert_allclose(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
-    ],
-)
-@pytest.mark.parametrize(
-    "test_dim", [[4, 3, 3], [3, 1, 6], [1, 4, 2], [5, 6, 1], [4, 3, 2, 1], [2, 3, 2, 4]]
-)
-def test_np_matmul_fails(numpy_dtype, test_dim):
-    N1 = None
-    N2 = None
-    M1 = None
-    M2 = None
-    np1 = None
-    rng = default_rng(123)
-
-    if len(test_dim) == 3:
-        N1 = test_dim[0]
-        N2 = test_dim[1]
-        M2 = test_dim[2]
-        np1 = rng.random(N1).astype(numpy_dtype)
-
-    if len(test_dim) == 4:
-        N1 = test_dim[0]
-        M1 = test_dim[1]
-        N2 = test_dim[2]
-        M2 = test_dim[3]
-        np1 = rng.random((N1, M1)).astype(numpy_dtype)
-
-    np2 = rng.random((N2, M2)).astype(numpy_dtype)
-
-    with pytest.raises(RuntimeError) as e_info:
-        view1 = pk.array(np1)
-        view2 = pk.array(np2)
-        pk.np_matmul(view1, view2)  # Should fail with 1d x 2d
-
-    err_np_matmul = (
-        "Matrix dimensions are not compatible for multiplication: {} and {}".format(
-            view1.shape, view2.shape
-        )
-    )
-    assert e_info.value.args[0] == err_np_matmul
-
-    with pytest.raises(RuntimeError) as e_info:
-        pk.np_matmul(view2, view1)  # should fail with 2d x 1 as well
-
-    err_np_matmul = (
-        "Matrix dimensions are not compatible for multiplication: {} and {}".format(
-            view2.shape, view1.shape
-        )
-    )
-    assert e_info.value.args[0] == err_np_matmul
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [(pk.subtract, np.subtract), (pk.add, np.add), (pk.multiply, np.multiply)],
-)
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
+        (pk.float64, np.float64),
+        (pk.float32, np.float32),
+        (pk.int32, np.int32),
+        (pk.int64, np.int64),
     ],
 )
-def test_multi_array_2d_exposed_ufuncs_vs_numpy(pk_ufunc, numpy_ufunc, numpy_dtype):
-    N = 4
-    M = 7
-    rng = default_rng(123)
-    np1 = rng.random((N, M)).astype(numpy_dtype)
-    np2 = rng.random((N, M)).astype(numpy_dtype)
-    expected = numpy_ufunc(np1, np2)
-
-    view1 = pk.array(np1)
-    view2 = pk.array(np2)
-    actual = pk_ufunc(view1, view2)
-
-    assert_allclose(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [
-        (pk.subtract, np.subtract),
-    ],
-)
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
-    ],
-)
-@pytest.mark.parametrize(
-    "test_dim",
-    [[4, 3, 1, 1], [4, 3, 1, 3], [4, 3, 4, 1], [4, 3, 1], [4, 3, 3], [4, 3], [4]],
-)
-def test_broadcast_array_exposed_ufuncs_vs_numpy(
-    pk_ufunc, numpy_ufunc, numpy_dtype, test_dim
-):
-
-    np1 = None
-    np2 = None
-    rng = default_rng(123)
-    scalar = 3.0
-
-    if len(test_dim) == 4:
-        np1 = rng.random((test_dim[0], test_dim[1])).astype(numpy_dtype)
-        np2 = rng.random((test_dim[2], test_dim[3])).astype(numpy_dtype)
-    elif len(test_dim) == 3:
-        np1 = rng.random((test_dim[0], test_dim[1])).astype(numpy_dtype)
-        np2 = rng.random((test_dim[2])).astype(numpy_dtype)
-    elif len(test_dim) == 2:
-        np1 = rng.random((test_dim[0], test_dim[1])).astype(numpy_dtype)
-        np2 = scalar  # 2d with scalar
-    elif len(test_dim) == 1:
-        np1 = rng.random((test_dim[0])).astype(numpy_dtype)
-        np2 = scalar  # 1d with scalar
+@pytest.mark.parametrize("arr_type", ["numpy", "kokkos"])
+def test_cumsum_ufunc(arr, pk_dtype, numpy_dtype, arr_type):
+    expected = np.cumsum(arr, dtype=numpy_dtype)
+    if arr_type == "kokkos":
+        view = pk.View(arr.shape, pk_dtype)
+        view[:] = arr
     else:
-        raise NotImplementedError(
-            "Invalid test conditions: Broadcasting operations are only supported uptil 2D"
-        )
-
-    assert (
-        np1 is not None and np2 is not None
-    ), "Invalid test conditions: Are parameters uptil 2D?"
-
-    expected = numpy_ufunc(np1, np2)
-
-    view1 = pk.array(np1)
-    view2 = pk.array(np2) if isinstance(np2, np.ndarray) else np2
-    actual = pk_ufunc(view1, view2)
-
-    assert_allclose(expected, actual)
-
-
-@pytest.mark.parametrize(
-    "pk_dtype, numpy_dtype",
-    [
-        (pk.double, np.float64),
-        (pk.float, np.float32),
-    ],
-)
-@pytest.mark.parametrize(
-    "in_arr",
-    [
-        np.array([-5, 4.5, np.nan]),
-        np.array([np.nan, np.nan, np.nan]),
-    ],
-)
-def test_sign_1d_special_cases(in_arr, pk_dtype, numpy_dtype):
-    in_arr = in_arr.astype(numpy_dtype)
-    view: pk.View1D = pk.View([in_arr.size], pk_dtype)
-    view[:] = in_arr
-    expected = np.sign(in_arr)
-    actual = pk.sign(view=view)
-    assert_allclose(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [
-        (pk.copyto, np.copyto),
-    ],
-)
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
-    ],
-)
-def test_copyto_1d(pk_ufunc, numpy_ufunc, numpy_dtype):
-    N = 4
-    M = 7
-    rng = default_rng(123)
-    np1 = rng.random((N, M)).astype(numpy_dtype)
-    np2 = rng.random((N, M)).astype(numpy_dtype)
-    numpy_ufunc(np1, np2)
-
-    view1 = pk.array(np1)
-    view2 = pk.array(np2)
-    pk_ufunc(view1, view2)
-
-    assert_allclose(np1, view1)
-
-
-@pytest.mark.parametrize(
-    "pk_ufunc, numpy_ufunc",
-    [
-        (pk.subtract, np.subtract),
-    ],
-)
-@pytest.mark.parametrize(
-    "numpy_dtype",
-    [
-        (np.float64),
-        (np.float32),
-    ],
-)
-@pytest.mark.parametrize(
-    "test_dim",
-    [
-        [4, 3, 4, 3],
-        [4, 3, 1, 1],
-        [4, 3, 1, 3],
-        [4, 3, 4, 1],
-        [4, 3, 1],
-        [4, 3, 3],
-        [4, 3],
-        [4],
-    ],
-)
-def test_copyto_broadcast_2d(pk_ufunc, numpy_ufunc, numpy_dtype, test_dim):
-    np1 = None
-    np2 = None
-    rng = default_rng(123)
-    scalar = 3.0
-
-    if len(test_dim) == 4:
-        np1 = rng.random((test_dim[0], test_dim[1])).astype(numpy_dtype)
-        np2 = rng.random((test_dim[2], test_dim[3])).astype(numpy_dtype)
-    elif len(test_dim) == 3:
-        np1 = rng.random((test_dim[0], test_dim[1])).astype(numpy_dtype)
-        np2 = rng.random((test_dim[2])).astype(numpy_dtype)
-    elif len(test_dim) == 2:
-        np1 = rng.random((test_dim[0], test_dim[1])).astype(numpy_dtype)
-        np2 = scalar  # 2d with scalar
-    elif len(test_dim) == 1:
-        np1 = rng.random((test_dim[0])).astype(numpy_dtype)
-        np2 = scalar  # 1d with scalar
+        view = arr.astype(numpy_dtype)
+    actual = pk.cumsum(view=view)
+    assert_allclose(actual, expected, rtol=1.3e-7)
+    # beyond the correct numerical results,
+    # let's also confirm that there is no memory
+    # overlap between the input and output array-like
+    # objects, and that pykokkos views/NumPy arrays
+    # as input result in the same type of output
+    assert not np.may_share_memory(actual, arr)
+    assert not np.may_share_memory(actual, view)
+    if arr_type == "kokkos":
+        # NOTE: could we get proper inheritance/instance
+        # checking here eventually?
+        assert "pykokkos" in str(type(actual))
+        assert "View" in str(type(actual))
     else:
-        raise NotImplementedError(
-            "Invalid test conditions: Broadcasting operations are only supported uptil 2D"
-        )
-
-    assert (
-        np1 is not None and np2 is not None
-    ), "Invalid test conditions: Are parameters uptil 2D?"
-
-    numpy_ufunc(np1, np2)
-
-    view1 = pk.array(np1)
-    view2 = pk.array(np2) if isinstance(np2, np.ndarray) else np2
-    pk_ufunc(view1, view2)
-
-    assert_allclose(np1, view1)
-
-
-@pytest.mark.parametrize(
-    "input_dtype",
-    [
-        pk.double,
-        pk.float,
-    ],
-)
-@pytest.mark.parametrize(
-    "pk_ufunc",
-    [
-        pk.floor,
-        pk.round,
-        pk.ceil,
-        pk.trunc,
-    ],
-)
-@pytest.mark.parametrize(
-    "shape",
-    [
-        [1],
-        [1, 1],
-        [1, 1, 1],
-    ],
-)
-def test_rounding_dtype_preservation(input_dtype, pk_ufunc, shape):
-    # at the time of writing the array API standard
-    # conformance test suite doesn't appear to probe
-    # floating point data types for many of the rounding
-    # functions
-
-    # for now, we simply test data type preservation
-    # of output vs. input so that we flush these codepaths
-    # a bit
-    view = pk.View(shape, input_dtype)
-    actual_dtype = pk_ufunc(view).dtype
-    assert actual_dtype.value == input_dtype.value
+        assert isinstance(actual, (np.ndarray, np.generic))
diff --git a/tools/pre_compile_ufuncs.py b/tools/pre_compile_ufuncs.py
index 0a035500..157026e5 100644
--- a/tools/pre_compile_ufuncs.py
+++ b/tools/pre_compile_ufuncs.py
@@ -63,9 +63,9 @@ def main():
                 except TypeError:
                     try:
                         func_obj(v)
-                    except (NotImplementedError, RuntimeError, KeyError):
+                    except (NotImplementedError, RuntimeError, KeyError, ImportError):
                         pass
-                except RuntimeError:
+                except (RuntimeError, ImportError):
                     # some cases like matmul have stricter
                     # signature requirements
                     if "matmul" in func[0]:
@@ -74,7 +74,7 @@ def main():
                         v2 = pk.View(new_shape, dtype=dtype)
                         try:
                             func_obj(v, v2)
-                        except RuntimeError:
+                        except (RuntimeError, ImportError):
                             pass
                     else:
                         pass