diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 031d47b9f50d5..6ee2d6adeca5d 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -382,19 +382,19 @@ shape: {self.shape}""" @staticmethod def from_scipy(obj, dim_names=None): """ - Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor + Convert scipy.sparse.coo_array or scipy.sparse.coo_matrix to arrow::SparseCOOTensor Parameters ---------- - obj : scipy.sparse.csr_matrix - The scipy matrix that should be converted. + obj : scipy.sparse.coo_array or scipy.sparse.coo_matrix + The scipy array or matrix that should be converted. dim_names : list, optional Names of the dimensions. """ import scipy.sparse - if not isinstance(obj, scipy.sparse.coo_matrix): + if not (isinstance(obj, scipy.sparse.coo_array) or isinstance(obj, scipy.sparse.coo_matrix)): raise TypeError( - f"Expected scipy.sparse.coo_matrix, got {type(obj)}") + f"Expected scipy.sparse.coo_array or scipy.sparse.coo_matrix, got {type(obj)}") cdef shared_ptr[CSparseCOOTensor] csparse_tensor cdef vector[int64_t] c_shape @@ -409,10 +409,11 @@ shape: {self.shape}""" row = obj.row col = obj.col - # When SciPy's coo_matrix has canonical format, its indices matrix is - # sorted in column-major order. As Arrow's SparseCOOIndex is sorted - # in row-major order if it is canonical, we must sort indices matrix - # into row-major order to keep its canonicalness, here. + # When SciPy's coo_array and coo_matrix have canonical format, their + # indices matrix is sorted in column-major order. As Arrow's + # SparseCOOIndex is sorted in row-major order if it is canonical, + # we must sort indices matrix into row-major order to keep its + # canonicalness here. if obj.has_canonical_format: order = np.lexsort((col, row)) # sort in row-major order row = row[order] @@ -493,9 +494,9 @@ shape: {self.shape}""" def to_scipy(self): """ - Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. 
+ Convert arrow::SparseCOOTensor to scipy.sparse.coo_array. """ - from scipy.sparse import coo_matrix + from scipy.sparse import coo_array cdef PyObject* out_data cdef PyObject* out_coords @@ -504,12 +505,12 @@ shape: {self.shape}""" data = PyObject_to_object(out_data) coords = PyObject_to_object(out_coords) row, col = coords[:, 0], coords[:, 1] - result = coo_matrix((data[:, 0], (row, col)), shape=self.shape) + result = coo_array((data[:, 0], (row, col)), shape=self.shape) # As the description in from_scipy above, we sorted indices matrix - # in row-major order if SciPy's coo_matrix has canonical format. - # So, we must call sum_duplicates() to make the result coo_matrix - # has canonical format. + # in row-major order if SciPy's coo_array has canonical format. + # So, we must call sum_duplicates() to make the resulting coo_array + # have canonical format. if self.has_canonical_format: result.sum_duplicates() return result @@ -693,19 +694,19 @@ shape: {self.shape}""" @staticmethod def from_scipy(obj, dim_names=None): """ - Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + Convert scipy.sparse.csr_array or scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. Parameters ---------- - obj : scipy.sparse.csr_matrix + obj : scipy.sparse.csr_array or scipy.sparse.csr_matrix The scipy matrix that should be converted. dim_names : list, optional Names of the dimensions. """ import scipy.sparse - if not isinstance(obj, scipy.sparse.csr_matrix): + if not (isinstance(obj, scipy.sparse.csr_array) or isinstance(obj, scipy.sparse.csr_matrix)): raise TypeError( - f"Expected scipy.sparse.csr_matrix, got {type(obj)}") + f"Expected scipy.sparse.csr_array or scipy.sparse.csr_matrix, got {type(obj)}") cdef shared_ptr[CSparseCSRMatrix] csparse_tensor cdef vector[int64_t] c_shape @@ -764,9 +765,9 @@ shape: {self.shape}""" def to_scipy(self): """ - Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. + Convert arrow::SparseCSRMatrix to scipy.sparse.csr_array. 
""" - from scipy.sparse import csr_matrix + from scipy.sparse import csr_array cdef PyObject* out_data cdef PyObject* out_indptr cdef PyObject* out_indices @@ -778,7 +779,7 @@ shape: {self.shape}""" data = PyObject_to_object(out_data) indptr = PyObject_to_object(out_indptr) indices = PyObject_to_object(out_indices) - result = csr_matrix((data[:, 0], indices, indptr), shape=self.shape) + result = csr_array((data[:, 0], indices, indptr), shape=self.shape) return result def to_tensor(self): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 7ba9e2b3e13db..7ace3796b80b3 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -26,10 +26,12 @@ import pyarrow as pa try: - from scipy.sparse import csr_matrix, coo_matrix + from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix except ImportError: coo_matrix = None csr_matrix = None + csr_array = None + coo_array = None try: import sparse @@ -51,6 +53,15 @@ ('f8', pa.float64()) ] +# Scipy does not support float16 +scipy_type_pairs = [ + x for x in tensor_type_pairs if x[1] != pa.float16()] + +shape_dim_name_pairs = [ + ((4, 6), ("x", "y")), + ((24,), ("x",)), +] + @pytest.mark.parametrize('sparse_tensor_type', [ pa.SparseCSRMatrix, @@ -395,17 +406,18 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): @pytest.mark.skipif(not coo_matrix, reason="requires scipy") -@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) -def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type): +@pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) +@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) +@pytest.mark.parametrize('shape,dim_names', shape_dim_name_pairs) +def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, + sparse_object, shape, dim_names): dtype = np.dtype(dtype_str) data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype) row = 
np.array([0, 0, 2, 3, 1, 3]) col = np.array([0, 2, 0, 4, 5, 5]) - shape = (4, 6) - dim_names = ('x', 'y') # non-canonical sparse coo matrix - scipy_matrix = coo_matrix((data, (row, col)), shape=shape) + scipy_matrix = sparse_object((data, (row, col)), shape=shape) sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_matrix, dim_names=dim_names) out_scipy_matrix = sparse_tensor.to_scipy() @@ -420,11 +432,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type): assert np.array_equal(scipy_matrix.row, out_scipy_matrix.row) assert np.array_equal(scipy_matrix.col, out_scipy_matrix.col) - if dtype_str == 'f2': - dense_array = \ - scipy_matrix.astype(np.float32).toarray().astype(np.float16) - else: - dense_array = scipy_matrix.toarray() + dense_array = scipy_matrix.toarray() assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy()) # canonical sparse coo matrix @@ -439,16 +447,17 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type): @pytest.mark.skipif(not csr_matrix, reason="requires scipy") -@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) -def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type): +@pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) +@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) +@pytest.mark.parametrize('shape,dim_names', shape_dim_name_pairs) +def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, + sparse_object, shape, dim_names): dtype = np.dtype(dtype_str) data = np.array([8, 2, 5, 3, 4, 6]).astype(dtype) indptr = np.array([0, 2, 3, 4, 6]) indices = np.array([0, 2, 5, 0, 4, 5]) - shape = (4, 6) - dim_names = ('x', 'y') - sparse_array = csr_matrix((data, indices, indptr), shape=shape) + sparse_array = sparse_object((data, indices, indptr), shape=shape) sparse_tensor = pa.SparseCSRMatrix.from_scipy(sparse_array, dim_names=dim_names) out_sparse_array = sparse_tensor.to_scipy() @@ -460,11 +469,7 @@ def 
test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type): assert np.array_equal(sparse_array.indptr, out_sparse_array.indptr) assert np.array_equal(sparse_array.indices, out_sparse_array.indices) - if dtype_str == 'f2': - dense_array = \ - sparse_array.astype(np.float32).toarray().astype(np.float16) - else: - dense_array = sparse_array.toarray() + dense_array = sparse_array.toarray() assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())