Skip to content

Commit 1ead380

Browse files
committed
Initial commit
1 parent 10f2c9c commit 1ead380

File tree

2 files changed

+49
-43
lines changed

2 files changed

+49
-43
lines changed

python/pyarrow/tensor.pxi

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -382,19 +382,19 @@ shape: {self.shape}"""
382382
@staticmethod
383383
def from_scipy(obj, dim_names=None):
384384
"""
385-
Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor
385+
Convert scipy.sparse.coo_array or scipy.sparse.coo_matrix to arrow::SparseCOOTensor.
386386
387387
Parameters
388388
----------
389-
obj : scipy.sparse.csr_matrix
390-
The scipy matrix that should be converted.
389+
obj : scipy.sparse.coo_array or scipy.sparse.coo_matrix
390+
The scipy array or matrix that should be converted.
391391
dim_names : list, optional
392392
Names of the dimensions.
393393
"""
394394
import scipy.sparse
395-
if not isinstance(obj, scipy.sparse.coo_matrix):
395+
if not (isinstance(obj, scipy.sparse.coo_array) or isinstance(obj, scipy.sparse.coo_matrix)):
396396
raise TypeError(
397-
f"Expected scipy.sparse.coo_matrix, got {type(obj)}")
397+
f"Expected scipy.sparse.coo_array or scipy.sparse.coo_matrix, got {type(obj)}")
398398

399399
cdef shared_ptr[CSparseCOOTensor] csparse_tensor
400400
cdef vector[int64_t] c_shape
@@ -409,10 +409,11 @@ shape: {self.shape}"""
409409
row = obj.row
410410
col = obj.col
411411

412-
# When SciPy's coo_matrix has canonical format, its indices matrix is
413-
# sorted in column-major order. As Arrow's SparseCOOIndex is sorted
414-
# in row-major order if it is canonical, we must sort indices matrix
415-
# into row-major order to keep its canonicalness, here.
412+
# When SciPy's coo_array and coo_matrix have canonical format, their
413+
# indices matrix is sorted in column-major order. As Arrow's
414+
# SparseCOOIndex is sorted in row-major order if it is canonical,
415+
# we must sort indices matrix into row-major order to keep its
416+
# canonicalness here.
416417
if obj.has_canonical_format:
417418
order = np.lexsort((col, row)) # sort in row-major order
418419
row = row[order]
@@ -493,9 +494,9 @@ shape: {self.shape}"""
493494

494495
def to_scipy(self):
495496
"""
496-
Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix.
497+
Convert arrow::SparseCOOTensor to scipy.sparse.coo_array.
497498
"""
498-
from scipy.sparse import coo_matrix
499+
from scipy.sparse import coo_array
499500
cdef PyObject* out_data
500501
cdef PyObject* out_coords
501502

@@ -504,12 +505,12 @@ shape: {self.shape}"""
504505
data = PyObject_to_object(out_data)
505506
coords = PyObject_to_object(out_coords)
506507
row, col = coords[:, 0], coords[:, 1]
507-
result = coo_matrix((data[:, 0], (row, col)), shape=self.shape)
508+
result = coo_array((data[:, 0], (row, col)), shape=self.shape)
508509

509510
# As described in from_scipy above, we sorted the indices matrix
510-
# in row-major order if SciPy's coo_matrix has canonical format.
511-
# So, we must call sum_duplicates() to make the result coo_matrix
512-
# has canonical format.
511+
# in row-major order if SciPy's coo_array has canonical format.
512+
# So, we must call sum_duplicates() to make the resulting coo_array
513+
# have canonical format.
513514
if self.has_canonical_format:
514515
result.sum_duplicates()
515516
return result
@@ -693,19 +694,19 @@ shape: {self.shape}"""
693694
@staticmethod
694695
def from_scipy(obj, dim_names=None):
695696
"""
696-
Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix.
697+
Convert scipy.sparse.csr_array or scipy.sparse.csr_matrix to arrow::SparseCSRMatrix.
697698
698699
Parameters
699700
----------
700-
obj : scipy.sparse.csr_matrix
701+
obj : scipy.sparse.csr_array or scipy.sparse.csr_matrix
701702
The scipy array or matrix that should be converted.
702703
dim_names : list, optional
703704
Names of the dimensions.
704705
"""
705706
import scipy.sparse
706-
if not isinstance(obj, scipy.sparse.csr_matrix):
707+
if not (isinstance(obj, scipy.sparse.csr_array) or isinstance(obj, scipy.sparse.csr_matrix)):
707708
raise TypeError(
708-
f"Expected scipy.sparse.csr_matrix, got {type(obj)}")
709+
f"Expected scipy.sparse.csr_array or scipy.sparse.csr_matrix, got {type(obj)}")
709710

710711
cdef shared_ptr[CSparseCSRMatrix] csparse_tensor
711712
cdef vector[int64_t] c_shape
@@ -764,9 +765,9 @@ shape: {self.shape}"""
764765

765766
def to_scipy(self):
766767
"""
767-
Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix.
768+
Convert arrow::SparseCSRMatrix to scipy.sparse.csr_array.
768769
"""
769-
from scipy.sparse import csr_matrix
770+
from scipy.sparse import csr_array
770771
cdef PyObject* out_data
771772
cdef PyObject* out_indptr
772773
cdef PyObject* out_indices
@@ -778,7 +779,7 @@ shape: {self.shape}"""
778779
data = PyObject_to_object(out_data)
779780
indptr = PyObject_to_object(out_indptr)
780781
indices = PyObject_to_object(out_indices)
781-
result = csr_matrix((data[:, 0], indices, indptr), shape=self.shape)
782+
result = csr_array((data[:, 0], indices, indptr), shape=self.shape)
782783
return result
783784

784785
def to_tensor(self):

python/pyarrow/tests/test_sparse_tensor.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,12 @@
2626
import pyarrow as pa
2727

2828
try:
29-
from scipy.sparse import csr_matrix, coo_matrix
29+
from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix
3030
except ImportError:
3131
coo_matrix = None
3232
csr_matrix = None
33+
csr_array = None
34+
coo_array = None
3335

3436
try:
3537
import sparse
@@ -51,6 +53,15 @@
5153
('f8', pa.float64())
5254
]
5355

56+
# Scipy does not support float16
57+
scipy_type_pairs = [
58+
x for x in tensor_type_pairs if x[1] != pa.float16()]
59+
60+
shape_dim_name_pairs = [
61+
((4, 6), ("x", "y")),
62+
((24,), ("x",)),
63+
]
64+
5465

5566
@pytest.mark.parametrize('sparse_tensor_type', [
5667
pa.SparseCSRMatrix,
@@ -395,17 +406,18 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type):
395406

396407

397408
@pytest.mark.skipif(not coo_matrix, reason="requires scipy")
398-
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
399-
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
409+
@pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix))
410+
@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs)
411+
@pytest.mark.parametrize('shape,dim_names', shape_dim_name_pairs)
412+
def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type,
413+
sparse_object, shape, dim_names):
400414
dtype = np.dtype(dtype_str)
401415
data = np.array([1, 2, 3, 4, 5, 6]).astype(dtype)
402416
row = np.array([0, 0, 2, 3, 1, 3])
403417
col = np.array([0, 2, 0, 4, 5, 5])
404-
shape = (4, 6)
405-
dim_names = ('x', 'y')
406418

407419
# non-canonical sparse coo matrix
408-
scipy_matrix = coo_matrix((data, (row, col)), shape=shape)
420+
scipy_matrix = sparse_object((data, (row, col)), shape=shape)
409421
sparse_tensor = pa.SparseCOOTensor.from_scipy(scipy_matrix,
410422
dim_names=dim_names)
411423
out_scipy_matrix = sparse_tensor.to_scipy()
@@ -420,11 +432,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
420432
assert np.array_equal(scipy_matrix.row, out_scipy_matrix.row)
421433
assert np.array_equal(scipy_matrix.col, out_scipy_matrix.col)
422434

423-
if dtype_str == 'f2':
424-
dense_array = \
425-
scipy_matrix.astype(np.float32).toarray().astype(np.float16)
426-
else:
427-
dense_array = scipy_matrix.toarray()
435+
dense_array = scipy_matrix.toarray()
428436
assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())
429437

430438
# canonical sparse coo matrix
@@ -439,16 +447,17 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type):
439447

440448

441449
@pytest.mark.skipif(not csr_matrix, reason="requires scipy")
442-
@pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs)
443-
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
450+
@pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix))
451+
@pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs)
452+
@pytest.mark.parametrize('shape,dim_names', shape_dim_name_pairs)
453+
def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type,
454+
sparse_object, shape, dim_names):
444455
dtype = np.dtype(dtype_str)
445456
data = np.array([8, 2, 5, 3, 4, 6]).astype(dtype)
446457
indptr = np.array([0, 2, 3, 4, 6])
447458
indices = np.array([0, 2, 5, 0, 4, 5])
448-
shape = (4, 6)
449-
dim_names = ('x', 'y')
450459

451-
sparse_array = csr_matrix((data, indices, indptr), shape=shape)
460+
sparse_array = sparse_object((data, indices, indptr), shape=shape)
452461
sparse_tensor = pa.SparseCSRMatrix.from_scipy(sparse_array,
453462
dim_names=dim_names)
454463
out_sparse_array = sparse_tensor.to_scipy()
@@ -460,11 +469,7 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type):
460469
assert np.array_equal(sparse_array.indptr, out_sparse_array.indptr)
461470
assert np.array_equal(sparse_array.indices, out_sparse_array.indices)
462471

463-
if dtype_str == 'f2':
464-
dense_array = \
465-
sparse_array.astype(np.float32).toarray().astype(np.float16)
466-
else:
467-
dense_array = sparse_array.toarray()
472+
dense_array = sparse_array.toarray()
468473
assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy())
469474

470475

0 commit comments

Comments
 (0)