Skip to content

Commit 3143f44

Browse files
Backport PR #60795: TST(string dtype): Resolve xfails in pytables (#60916)
* ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ec) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251) * Adjust test --------- Co-authored-by: Jake Thomas Trevallion <[email protected]>
1 parent 12cd45e commit 3143f44

13 files changed

+145
-151
lines changed

pandas/io/pytables.py

+3
Original file line numberDiff line numberDiff line change
@@ -5093,6 +5093,9 @@ def _maybe_convert_for_string_atom(
50935093
errors,
50945094
columns: list[str],
50955095
):
5096+
if isinstance(bvalues.dtype, StringDtype):
5097+
# "ndarray[Any, Any]" has no attribute "to_numpy"
5098+
bvalues = bvalues.to_numpy() # type: ignore[union-attr]
50965099
if bvalues.dtype != object:
50975100
return bvalues
50985101

pandas/tests/io/pytables/test_append.py

+30-26
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@
2525
ensure_clean_store,
2626
)
2727

28-
pytestmark = [
29-
pytest.mark.single_cpu,
30-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
31-
]
28+
pytestmark = [pytest.mark.single_cpu]
3229

3330
tables = pytest.importorskip("tables")
3431

@@ -40,7 +37,7 @@ def test_append(setup_path):
4037
# tables.NaturalNameWarning):
4138
df = DataFrame(
4239
np.random.default_rng(2).standard_normal((20, 4)),
43-
columns=Index(list("ABCD"), dtype=object),
40+
columns=Index(list("ABCD")),
4441
index=date_range("2000-01-01", periods=20, freq="B"),
4542
)
4643
_maybe_remove(store, "df1")
@@ -201,7 +198,7 @@ def test_append_some_nans(setup_path):
201198
tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
202199

203200

204-
def test_append_all_nans(setup_path):
201+
def test_append_all_nans(setup_path, using_infer_string):
205202
with ensure_clean_store(setup_path) as store:
206203
df = DataFrame(
207204
{
@@ -253,7 +250,13 @@ def test_append_all_nans(setup_path):
253250
_maybe_remove(store, "df")
254251
store.append("df", df[:10], dropna=True)
255252
store.append("df", df[10:], dropna=True)
256-
tm.assert_frame_equal(store["df"], df, check_index_type=True)
253+
result = store["df"]
254+
expected = df
255+
if using_infer_string:
256+
# TODO: Test is incorrect when not using_infer_string.
257+
# Should take the last 4 rows unconditionally.
258+
expected = expected[-4:]
259+
tm.assert_frame_equal(result, expected, check_index_type=True)
257260

258261
_maybe_remove(store, "df2")
259262
store.append("df2", df[:10], dropna=False)
@@ -292,7 +295,7 @@ def test_append_frame_column_oriented(setup_path):
292295
# column oriented
293296
df = DataFrame(
294297
np.random.default_rng(2).standard_normal((10, 4)),
295-
columns=Index(list("ABCD"), dtype=object),
298+
columns=Index(list("ABCD")),
296299
index=date_range("2000-01-01", periods=10, freq="B"),
297300
)
298301
df.index = df.index._with_freq(None) # freq doesn't round-trip
@@ -417,7 +420,7 @@ def check_col(key, name, size):
417420
{
418421
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
419422
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
420-
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
423+
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
421424
"D": date_range("20130101", periods=5),
422425
}
423426
).set_index("C")
@@ -444,7 +447,7 @@ def check_col(key, name, size):
444447
_maybe_remove(store, "df")
445448
df = DataFrame(
446449
np.random.default_rng(2).standard_normal((10, 4)),
447-
columns=Index(list("ABCD"), dtype=object),
450+
columns=Index(list("ABCD")),
448451
index=date_range("2000-01-01", periods=10, freq="B"),
449452
)
450453
df["string"] = "foo"
@@ -504,11 +507,12 @@ def test_append_with_empty_string(setup_path):
504507
tm.assert_frame_equal(store.select("df"), df)
505508

506509

510+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
507511
def test_append_with_data_columns(setup_path):
508512
with ensure_clean_store(setup_path) as store:
509513
df = DataFrame(
510514
np.random.default_rng(2).standard_normal((10, 4)),
511-
columns=Index(list("ABCD"), dtype=object),
515+
columns=Index(list("ABCD")),
512516
index=date_range("2000-01-01", periods=10, freq="B"),
513517
)
514518
df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -684,8 +688,8 @@ def test_append_misc(setup_path):
684688
with ensure_clean_store(setup_path) as store:
685689
df = DataFrame(
686690
1.1 * np.arange(120).reshape((30, 4)),
687-
columns=Index(list("ABCD"), dtype=object),
688-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
691+
columns=Index(list("ABCD")),
692+
index=Index([f"i-{i}" for i in range(30)]),
689693
)
690694
store.append("df", df, chunksize=1)
691695
result = store.select("df")
@@ -701,8 +705,8 @@ def test_append_misc_chunksize(setup_path, chunksize):
701705
# more chunksize in append tests
702706
df = DataFrame(
703707
1.1 * np.arange(120).reshape((30, 4)),
704-
columns=Index(list("ABCD"), dtype=object),
705-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
708+
columns=Index(list("ABCD")),
709+
index=Index([f"i-{i}" for i in range(30)]),
706710
)
707711
df["string"] = "foo"
708712
df["float322"] = 1.0
@@ -742,15 +746,15 @@ def test_append_misc_empty_frame(setup_path):
742746
# the conversion from AM->BM converts the invalid object dtype column into
743747
# a datetime64 column no longer raising an error
744748
@td.skip_array_manager_not_yet_implemented
745-
def test_append_raise(setup_path):
749+
def test_append_raise(setup_path, using_infer_string):
746750
with ensure_clean_store(setup_path) as store:
747751
# test append with invalid input to get good error messages
748752

749753
# list in column
750754
df = DataFrame(
751755
1.1 * np.arange(120).reshape((30, 4)),
752-
columns=Index(list("ABCD"), dtype=object),
753-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
756+
columns=Index(list("ABCD")),
757+
index=Index([f"i-{i}" for i in range(30)]),
754758
)
755759
df["invalid"] = [["a"]] * len(df)
756760
assert df.dtypes["invalid"] == np.object_
@@ -770,8 +774,8 @@ def test_append_raise(setup_path):
770774
# datetime with embedded nans as object
771775
df = DataFrame(
772776
1.1 * np.arange(120).reshape((30, 4)),
773-
columns=Index(list("ABCD"), dtype=object),
774-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
777+
columns=Index(list("ABCD")),
778+
index=Index([f"i-{i}" for i in range(30)]),
775779
)
776780
s = Series(datetime.datetime(2001, 1, 2), index=df.index)
777781
s = s.astype(object)
@@ -798,8 +802,8 @@ def test_append_raise(setup_path):
798802
# appending an incompatible table
799803
df = DataFrame(
800804
1.1 * np.arange(120).reshape((30, 4)),
801-
columns=Index(list("ABCD"), dtype=object),
802-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
805+
columns=Index(list("ABCD")),
806+
index=Index([f"i-{i}" for i in range(30)]),
803807
)
804808
store.append("df", df)
805809

@@ -876,7 +880,7 @@ def test_append_with_timedelta(setup_path):
876880
def test_append_to_multiple(setup_path):
877881
df1 = DataFrame(
878882
np.random.default_rng(2).standard_normal((10, 4)),
879-
columns=Index(list("ABCD"), dtype=object),
883+
columns=Index(list("ABCD")),
880884
index=date_range("2000-01-01", periods=10, freq="B"),
881885
)
882886
df2 = df1.copy().rename(columns="{}_2".format)
@@ -913,12 +917,12 @@ def test_append_to_multiple(setup_path):
913917
def test_append_to_multiple_dropna(setup_path):
914918
df1 = DataFrame(
915919
np.random.default_rng(2).standard_normal((10, 4)),
916-
columns=Index(list("ABCD"), dtype=object),
920+
columns=Index(list("ABCD")),
917921
index=date_range("2000-01-01", periods=10, freq="B"),
918922
)
919923
df2 = DataFrame(
920924
np.random.default_rng(2).standard_normal((10, 4)),
921-
columns=Index(list("ABCD"), dtype=object),
925+
columns=Index(list("ABCD")),
922926
index=date_range("2000-01-01", periods=10, freq="B"),
923927
).rename(columns="{}_2".format)
924928
df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -938,7 +942,7 @@ def test_append_to_multiple_dropna(setup_path):
938942
def test_append_to_multiple_dropna_false(setup_path):
939943
df1 = DataFrame(
940944
np.random.default_rng(2).standard_normal((10, 4)),
941-
columns=Index(list("ABCD"), dtype=object),
945+
columns=Index(list("ABCD")),
942946
index=date_range("2000-01-01", periods=10, freq="B"),
943947
)
944948
df2 = df1.copy().rename(columns="{}_2".format)

pandas/tests/io/pytables/test_categorical.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,7 @@
1616
ensure_clean_store,
1717
)
1818

19-
pytestmark = [
20-
pytest.mark.single_cpu,
21-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
22-
]
19+
pytestmark = [pytest.mark.single_cpu]
2320

2421

2522
def test_categorical(setup_path):
@@ -143,6 +140,7 @@ def test_categorical(setup_path):
143140
store.select("df3/meta/s/meta")
144141

145142

143+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
146144
def test_categorical_conversion(tmp_path, setup_path):
147145
# GH13322
148146
# Check that read_hdf with categorical columns doesn't return rows if

pandas/tests/io/pytables/test_complex.py

-6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
import pandas as pd
75
from pandas import (
86
DataFrame,
@@ -13,10 +11,6 @@
1311

1412
from pandas.io.pytables import read_hdf
1513

16-
pytestmark = pytest.mark.xfail(
17-
using_string_dtype(), reason="TODO(infer_string)", strict=False
18-
)
19-
2014

2115
def test_complex_fixed(tmp_path, setup_path):
2216
df = DataFrame(

pandas/tests/io/pytables/test_errors.py

+9-9
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._config import using_string_dtype
9-
108
from pandas import (
119
CategoricalIndex,
1210
DataFrame,
@@ -24,10 +22,7 @@
2422
_maybe_adjust_name,
2523
)
2624

27-
pytestmark = [
28-
pytest.mark.single_cpu,
29-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
30-
]
25+
pytestmark = [pytest.mark.single_cpu]
3126

3227

3328
def test_pass_spec_to_storer(setup_path):
@@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):
9388

9489
with ensure_clean_store(setup_path) as store:
9590
# this fails because we have a date in the object block......
96-
msg = re.escape(
97-
"""Cannot serialize the column [datetime1]
98-
because its data contents are not [string] but [date] object dtype"""
91+
msg = "|".join(
92+
[
93+
re.escape(
94+
"Cannot serialize the column [datetime1]\nbecause its data "
95+
"contents are not [string] but [date] object dtype"
96+
),
97+
re.escape("[date] is not implemented as a table column"),
98+
]
9999
)
100100
with pytest.raises(TypeError, match=msg):
101101
store.append("df_unimplemented", df)

pandas/tests/io/pytables/test_file_handling.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas.compat import (
97
PY311,
108
is_ci_environment,
@@ -34,9 +32,7 @@
3432
from pandas.io import pytables
3533
from pandas.io.pytables import Term
3634

37-
pytestmark = [
38-
pytest.mark.single_cpu,
39-
]
35+
pytestmark = [pytest.mark.single_cpu]
4036

4137

4238
@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
@@ -323,7 +319,6 @@ def test_complibs(tmp_path, lvl, lib, request):
323319
assert node.filters.complib == lib
324320

325321

326-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
327322
@pytest.mark.skipif(
328323
not is_platform_little_endian(), reason="reason platform is not little endian"
329324
)
@@ -341,7 +336,6 @@ def test_encoding(setup_path):
341336
tm.assert_frame_equal(result, expected)
342337

343338

344-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
345339
@pytest.mark.parametrize(
346340
"val",
347341
[
@@ -356,7 +350,7 @@ def test_encoding(setup_path):
356350
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
357351
],
358352
)
359-
@pytest.mark.parametrize("dtype", ["category", object])
353+
@pytest.mark.parametrize("dtype", ["category", None])
360354
def test_latin_encoding(tmp_path, setup_path, dtype, val):
361355
enc = "latin-1"
362356
nan_rep = ""

pandas/tests/io/pytables/test_keys.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas import (
75
DataFrame,
86
HDFStore,
@@ -15,10 +13,7 @@
1513
tables,
1614
)
1715

18-
pytestmark = [
19-
pytest.mark.single_cpu,
20-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
21-
]
16+
pytestmark = [pytest.mark.single_cpu]
2217

2318

2419
def test_keys(setup_path):

pandas/tests/io/pytables/test_put.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@
2222
)
2323
from pandas.util import _test_decorators as td
2424

25-
pytestmark = [
26-
pytest.mark.single_cpu,
27-
]
25+
pytestmark = [pytest.mark.single_cpu]
2826

2927

3028
def test_format_type(tmp_path, setup_path):

pandas/tests/io/pytables/test_read.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,7 @@
2828

2929
from pandas.io.pytables import TableIterator
3030

31-
pytestmark = [
32-
pytest.mark.single_cpu,
33-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
34-
]
31+
pytestmark = [pytest.mark.single_cpu]
3532

3633

3734
def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -77,10 +74,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
7774
read_hdf(store, "k1")
7875

7976

77+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
8078
def test_read_column(setup_path):
8179
df = DataFrame(
8280
np.random.default_rng(2).standard_normal((10, 4)),
83-
columns=Index(list("ABCD"), dtype=object),
81+
columns=Index(list("ABCD")),
8482
index=date_range("2000-01-01", periods=10, freq="B"),
8583
)
8684

@@ -221,7 +219,7 @@ def test_legacy_table_read_py2(datapath):
221219
tm.assert_frame_equal(expected, result)
222220

223221

224-
def test_read_hdf_open_store(tmp_path, setup_path):
222+
def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
225223
# GH10330
226224
# No check for non-string path_or_buf, and no test of open store
227225
df = DataFrame(
@@ -233,6 +231,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
233231
df = df.set_index(keys="E", append=True)
234232

235233
path = tmp_path / setup_path
234+
if using_infer_string:
235+
# TODO(infer_string) make this work for string dtype
236+
msg = "Saving a MultiIndex with an extension dtype is not supported."
237+
with pytest.raises(NotImplementedError, match=msg):
238+
df.to_hdf(path, key="df", mode="w")
239+
return
236240
df.to_hdf(path, key="df", mode="w")
237241
direct = read_hdf(path, "df")
238242
with HDFStore(path, mode="r") as store:

0 commit comments

Comments
 (0)