Skip to content

Commit 084b199

Browse files
TST (string dtype): resolve xfails in pandas/tests/copy_view (pandas-dev#60245)
1 parent 7740c4e commit 084b199

File tree

5 files changed

+46
-57
lines changed

5 files changed

+46
-57
lines changed

pandas/_testing/__init__.py

+9-19
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from typing import (
88
TYPE_CHECKING,
99
ContextManager,
10-
cast,
1110
)
1211

1312
import numpy as np
@@ -21,8 +20,6 @@
2120

2221
from pandas.compat import pa_version_under10p1
2322

24-
from pandas.core.dtypes.common import is_string_dtype
25-
2623
import pandas as pd
2724
from pandas import (
2825
ArrowDtype,
@@ -77,8 +74,8 @@
7774
with_csv_dialect,
7875
)
7976
from pandas.core.arrays import (
77+
ArrowExtensionArray,
8078
BaseMaskedArray,
81-
ExtensionArray,
8279
NumpyExtensionArray,
8380
)
8481
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -92,7 +89,6 @@
9289
NpDtype,
9390
)
9491

95-
from pandas.core.arrays import ArrowExtensionArray
9692

9793
UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
9894
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -512,24 +508,18 @@ def shares_memory(left, right) -> bool:
512508
if isinstance(left, pd.core.arrays.IntervalArray):
513509
return shares_memory(left._left, right) or shares_memory(left._right, right)
514510

515-
if (
516-
isinstance(left, ExtensionArray)
517-
and is_string_dtype(left.dtype)
518-
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
519-
):
520-
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
521-
left = cast("ArrowExtensionArray", left)
522-
if (
523-
isinstance(right, ExtensionArray)
524-
and is_string_dtype(right.dtype)
525-
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
526-
):
527-
right = cast("ArrowExtensionArray", right)
511+
if isinstance(left, ArrowExtensionArray):
512+
if isinstance(right, ArrowExtensionArray):
513+
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
528514
left_pa_data = left._pa_array
529515
right_pa_data = right._pa_array
530516
left_buf1 = left_pa_data.chunk(0).buffers()[1]
531517
right_buf1 = right_pa_data.chunk(0).buffers()[1]
532-
return left_buf1 == right_buf1
518+
return left_buf1.address == right_buf1.address
519+
else:
520+
# if we have one ArrowExtensionArray and one other array, assume
521+
# they can only share memory if they share the same numpy buffer
522+
return np.shares_memory(left, right)
533523

534524
if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
535525
# By convention, we'll say these share memory if they share *either*

pandas/tests/copy_view/test_astype.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas.compat import HAS_PYARROW
97
from pandas.compat.pyarrow import pa_version_under12p0
108

@@ -206,7 +204,6 @@ def test_astype_arrow_timestamp():
206204
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
207205

208206

209-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
210207
def test_convert_dtypes_infer_objects():
211208
ser = Series(["a", "b", "c"])
212209
ser_orig = ser.copy()
@@ -217,20 +214,25 @@ def test_convert_dtypes_infer_objects():
217214
convert_string=False,
218215
)
219216

220-
assert np.shares_memory(get_array(ser), get_array(result))
217+
assert tm.shares_memory(get_array(ser), get_array(result))
221218
result.iloc[0] = "x"
222219
tm.assert_series_equal(ser, ser_orig)
223220

224221

225-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
226-
def test_convert_dtypes():
222+
def test_convert_dtypes(using_infer_string):
227223
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
228224
df_orig = df.copy()
229225
df2 = df.convert_dtypes()
230226

231-
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
232-
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
233-
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
234-
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
227+
if using_infer_string and HAS_PYARROW:
228+
# TODO the default nullable string dtype still uses python storage
229+
# this should be changed to pyarrow if installed
230+
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
231+
else:
232+
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
233+
assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
234+
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
235+
assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
235236
df2.iloc[0, 0] = "x"
237+
df2.iloc[0, 1] = 10
236238
tm.assert_frame_equal(df, df_orig)

pandas/tests/copy_view/test_functions.py

-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,6 @@ def test_concat_copy_keyword():
153153
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
154154

155155

156-
# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
157156
@pytest.mark.parametrize(
158157
"func",
159158
[

pandas/tests/copy_view/test_methods.py

+21-17
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas.compat import HAS_PYARROW
75

86
import pandas as pd
@@ -716,14 +714,18 @@ def test_head_tail(method):
716714
tm.assert_frame_equal(df, df_orig)
717715

718716

719-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
720-
def test_infer_objects():
721-
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
717+
def test_infer_objects(using_infer_string):
718+
df = DataFrame(
719+
{"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
720+
)
722721
df_orig = df.copy()
723722
df2 = df.infer_objects()
724723

725724
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
726-
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
725+
if using_infer_string and HAS_PYARROW:
726+
assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
727+
else:
728+
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
727729

728730
df2.iloc[0, 0] = 0
729731
df2.iloc[0, 1] = "d"
@@ -732,19 +734,16 @@ def test_infer_objects():
732734
tm.assert_frame_equal(df, df_orig)
733735

734736

735-
@pytest.mark.xfail(
736-
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
737-
)
738-
def test_infer_objects_no_reference():
737+
def test_infer_objects_no_reference(using_infer_string):
739738
df = DataFrame(
740739
{
741740
"a": [1, 2],
742-
"b": "c",
741+
"b": Series(["x", "y"], dtype=object),
743742
"c": 1,
744743
"d": Series(
745744
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
746745
),
747-
"e": "b",
746+
"e": Series(["z", "w"], dtype=object),
748747
}
749748
)
750749
df = df.infer_objects()
@@ -757,16 +756,22 @@ def test_infer_objects_no_reference():
757756
df.iloc[0, 1] = "d"
758757
df.iloc[0, 3] = Timestamp("2018-12-31")
759758
assert np.shares_memory(arr_a, get_array(df, "a"))
760-
# TODO(CoW): Block splitting causes references here
761-
assert not np.shares_memory(arr_b, get_array(df, "b"))
759+
if using_infer_string and HAS_PYARROW:
760+
# note that the underlying memory of arr_b has been copied anyway
761+
# because of the assignment, but the EA is updated inplace so still
762+
# appears the share memory
763+
assert tm.shares_memory(arr_b, get_array(df, "b"))
764+
else:
765+
# TODO(CoW): Block splitting causes references here
766+
assert not np.shares_memory(arr_b, get_array(df, "b"))
762767
assert np.shares_memory(arr_d, get_array(df, "d"))
763768

764769

765770
def test_infer_objects_reference():
766771
df = DataFrame(
767772
{
768773
"a": [1, 2],
769-
"b": "c",
774+
"b": Series(["x", "y"], dtype=object),
770775
"c": 1,
771776
"d": Series(
772777
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
@@ -904,14 +909,13 @@ def test_sort_values_inplace(obj, kwargs):
904909
tm.assert_equal(view, obj_orig)
905910

906911

907-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
908912
@pytest.mark.parametrize("decimals", [-1, 0, 1])
909913
def test_round(decimals):
910914
df = DataFrame({"a": [1, 2], "b": "c"})
911915
df_orig = df.copy()
912916
df2 = df.round(decimals=decimals)
913917

914-
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
918+
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
915919
# TODO: Make inplace by using out parameter of ndarray.round?
916920
if decimals >= 0:
917921
# Ensure lazy copy if no-op

pandas/tests/copy_view/test_replace.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
6-
from pandas.compat import HAS_PYARROW
7-
84
from pandas import (
95
Categorical,
106
DataFrame,
@@ -13,7 +9,6 @@
139
from pandas.tests.copy_view.util import get_array
1410

1511

16-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1712
@pytest.mark.parametrize(
1813
"replace_kwargs",
1914
[
@@ -30,14 +25,14 @@
3025
],
3126
)
3227
def test_replace(replace_kwargs):
33-
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
28+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
3429
df_orig = df.copy()
3530

3631
df_replaced = df.replace(**replace_kwargs)
3732

3833
if (df_replaced["b"] == df["b"]).all():
3934
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
40-
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
35+
assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
4136

4237
# mutating squeezed df triggers a copy-on-write for that column/block
4338
df_replaced.loc[0, "c"] = -1
@@ -61,18 +56,17 @@ def test_replace_regex_inplace_refs():
6156
tm.assert_frame_equal(view, df_orig)
6257

6358

64-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
6559
def test_replace_regex_inplace():
6660
df = DataFrame({"a": ["aaa", "bbb"]})
6761
arr = get_array(df, "a")
6862
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
6963
assert df._mgr._has_no_reference(0)
70-
assert np.shares_memory(arr, get_array(df, "a"))
64+
assert tm.shares_memory(arr, get_array(df, "a"))
7165

7266
df_orig = df.copy()
7367
df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
7468
tm.assert_frame_equal(df_orig, df)
75-
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
69+
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
7670

7771

7872
def test_replace_regex_inplace_no_op():

0 commit comments

Comments
 (0)