Skip to content

GH-45229: [Python] Migrate from scipy.spmatrix to scipy.sparray #46423

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 23 additions & 22 deletions python/pyarrow/tensor.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -382,19 +382,19 @@ shape: {self.shape}"""
@staticmethod
def from_scipy(obj, dim_names=None):
"""
Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor
Convert scipy.sparse.coo_array or scipy.sparse.coo_matrix to arrow::SparseCOOTensor

Parameters
----------
obj : scipy.sparse.csr_matrix
The scipy matrix that should be converted.
obj : scipy.sparse.coo_array or scipy.sparse.coo_matrix
The scipy array or matrix that should be converted.
dim_names : list, optional
Names of the dimensions.
"""
import scipy.sparse
if not isinstance(obj, scipy.sparse.coo_matrix):
if not (isinstance(obj, scipy.sparse.coo_array) or isinstance(obj, scipy.sparse.coo_matrix)):
raise TypeError(
f"Expected scipy.sparse.coo_matrix, got {type(obj)}")
f"Expected scipy.sparse.coo_array or scipy.sparse.coo_matrix, got {type(obj)}")

cdef shared_ptr[CSparseCOOTensor] csparse_tensor
cdef vector[int64_t] c_shape
Expand All @@ -409,10 +409,11 @@ shape: {self.shape}"""
row = obj.row
col = obj.col

# When SciPy's coo_matrix has canonical format, its indices matrix is
# sorted in column-major order. As Arrow's SparseCOOIndex is sorted
# in row-major order if it is canonical, we must sort indices matrix
# into row-major order to keep its canonicalness, here.
# When SciPy's coo_array and coo_matrix have canonical format, their
# indices matrix is sorted in column-major order. As Arrow's
# SparseCOOIndex is sorted in row-major order if it is canonical,
# we must sort the indices matrix into row-major order to keep its
# canonicalness here.
if obj.has_canonical_format:
order = np.lexsort((col, row)) # sort in row-major order
row = row[order]
Expand Down Expand Up @@ -493,9 +494,9 @@ shape: {self.shape}"""

def to_scipy(self):
"""
Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix.
Convert arrow::SparseCOOTensor to scipy.sparse.coo_array.
"""
from scipy.sparse import coo_matrix
from scipy.sparse import coo_array
cdef PyObject* out_data
cdef PyObject* out_coords

Expand All @@ -504,12 +505,12 @@ shape: {self.shape}"""
data = PyObject_to_object(out_data)
coords = PyObject_to_object(out_coords)
row, col = coords[:, 0], coords[:, 1]
result = coo_matrix((data[:, 0], (row, col)), shape=self.shape)
result = coo_array((data[:, 0], (row, col)), shape=self.shape)

# As described in from_scipy above, we sorted the indices matrix
# in row-major order if SciPy's coo_matrix has canonical format.
# So, we must call sum_duplicates() to make the result coo_matrix
# has canonical format.
# in row-major order if SciPy's coo_array has canonical format.
# So, we must call sum_duplicates() to make the resulting coo_array
# have canonical format.
if self.has_canonical_format:
result.sum_duplicates()
return result
Expand Down Expand Up @@ -693,19 +694,19 @@ shape: {self.shape}"""
@staticmethod
def from_scipy(obj, dim_names=None):
"""
Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix.
Convert scipy.sparse.csr_array or scipy.sparse.csr_matrix to arrow::SparseCSRMatrix.

Parameters
----------
obj : scipy.sparse.csr_matrix
obj : scipy.sparse.csr_array or scipy.sparse.csr_matrix
The scipy array or matrix that should be converted.
dim_names : list, optional
Names of the dimensions.
"""
import scipy.sparse
if not isinstance(obj, scipy.sparse.csr_matrix):
if not (isinstance(obj, scipy.sparse.csr_array) or isinstance(obj, scipy.sparse.csr_matrix)):
raise TypeError(
f"Expected scipy.sparse.csr_matrix, got {type(obj)}")
f"Expected scipy.sparse.csr_array or scipy.sparse.csr_matrix, got {type(obj)}")

cdef shared_ptr[CSparseCSRMatrix] csparse_tensor
cdef vector[int64_t] c_shape
Expand Down Expand Up @@ -764,9 +765,9 @@ shape: {self.shape}"""

def to_scipy(self):
"""
Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix.
Convert arrow::SparseCSRMatrix to scipy.sparse.csr_array.
"""
from scipy.sparse import csr_matrix
from scipy.sparse import csr_array
cdef PyObject* out_data
cdef PyObject* out_indptr
cdef PyObject* out_indices
Expand All @@ -778,7 +779,7 @@ shape: {self.shape}"""
data = PyObject_to_object(out_data)
indptr = PyObject_to_object(out_indptr)
indices = PyObject_to_object(out_indices)
result = csr_matrix((data[:, 0], indices, indptr), shape=self.shape)
result = csr_array((data[:, 0], indices, indptr), shape=self.shape)
return result

def to_tensor(self):
Expand Down
47 changes: 26 additions & 21 deletions python/pyarrow/tests/test_sparse_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@
import pyarrow as pa

try:
from scipy.sparse import csr_matrix, coo_matrix
from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix
except ImportError:
coo_matrix = None
csr_matrix = None
csr_array = None
coo_array = None

try:
import sparse
Expand All @@ -51,6 +53,15 @@
('f8', pa.float64())
]

# Scipy does not support float16
scipy_type_pairs = [
x for x in tensor_type_pairs if x[1] != pa.float16()]

shape_dim_name_pairs = [
((4, 6), ("x", "y")),
((24,), ("x",)),
]


@pytest.mark.parametrize('sparse_tensor_type', [
pa.SparseCSRMatrix,
Expand Down Expand Up @@ -395,17 +406,18 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):


@pytest.mark.skipif(not coo_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
@pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix))
@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs)
@pytest.mark.parametrize('shape,dim_names', shape_dim_name_pairs)
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type,
sparse_object, shape, dim_names):
dtype = np.dtype(dtype_str)
data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype)
row = np.array([0, 0, 2, 3, 1, 3])
col = np.array([0, 2, 0, 4, 5, 5])
shape = (4, 6)
dim_names = ('x', 'y')

# non-canonical sparse coo matrix
scipy_matrix = coo_matrix((data, (row, col)), shape=shape)
scipy_matrix = sparse_object((data, (row, col)), shape=shape)
sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_matrix,
dim_names=dim_names)
out_scipy_matrix = sparse_tensor.to_scipy()
Expand All @@ -420,11 +432,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
assert np.array_equal(scipy_matrix.row, out_scipy_matrix.row)
assert np.array_equal(scipy_matrix.col, out_scipy_matrix.col)

if dtype_str == 'f2':
dense_array = \
scipy_matrix.astype(np.float32).toarray().astype(np.float16)
else:
dense_array = scipy_matrix.toarray()
dense_array = scipy_matrix.toarray()
assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())

# canonical sparse coo matrix
Expand All @@ -439,16 +447,17 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):


@pytest.mark.skipif(not csr_matrix, reason="requires scipy")
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
@pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix))
@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs)
@pytest.mark.parametrize('shape,dim_names', shape_dim_name_pairs)
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type,
sparse_object, shape, dim_names):
dtype = np.dtype(dtype_str)
data = np.array([8, 2, 5, 3, 4, 6]).astype(dtype)
indptr = np.array([0, 2, 3, 4, 6])
indices = np.array([0, 2, 5, 0, 4, 5])
shape = (4, 6)
dim_names = ('x', 'y')

sparse_array = csr_matrix((data, indices, indptr), shape=shape)
sparse_array = sparse_object((data, indices, indptr), shape=shape)
sparse_tensor = pa.SparseCSRMatrix.from_scipy(sparse_array,
dim_names=dim_names)
out_sparse_array = sparse_tensor.to_scipy()
Expand All @@ -460,11 +469,7 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
assert np.array_equal(sparse_array.indptr, out_sparse_array.indptr)
assert np.array_equal(sparse_array.indices, out_sparse_array.indices)

if dtype_str == 'f2':
dense_array = \
sparse_array.astype(np.float32).toarray().astype(np.float16)
else:
dense_array = sparse_array.toarray()
dense_array = sparse_array.toarray()
assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())


Expand Down
Loading