Backport PR #60795: TST(string dtype): Resolve xfails in pytables (#60916)

rhshadrach · JakeTT404 · web-flow · commit 3143f441e2a5 · 2025-02-16T09:39:03.000-08:00
* ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ec) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251) * Adjust test --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com>
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -5093,6 +5093,9 @@ def _maybe_convert_for_string_atom(
     errors,
     columns: list[str],
 ):
+    if isinstance(bvalues.dtype, StringDtype):
+        # "ndarray[Any, Any]" has no attribute "to_numpy"
+        bvalues = bvalues.to_numpy()  # type: ignore[union-attr]
     if bvalues.dtype != object:
         return bvalues
 
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
@@ -25,10 +25,7 @@
     ensure_clean_store,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 tables = pytest.importorskip("tables")
 
@@ -40,7 +37,7 @@ def test_append(setup_path):
         # tables.NaturalNameWarning):
         df = DataFrame(
             np.random.default_rng(2).standard_normal((20, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=20, freq="B"),
         )
         _maybe_remove(store, "df1")
@@ -201,7 +198,7 @@ def test_append_some_nans(setup_path):
         tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
 
 
-def test_append_all_nans(setup_path):
+def test_append_all_nans(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             {
@@ -253,7 +250,13 @@ def test_append_all_nans(setup_path):
             _maybe_remove(store, "df")
             store.append("df", df[:10], dropna=True)
             store.append("df", df[10:], dropna=True)
-            tm.assert_frame_equal(store["df"], df, check_index_type=True)
+            result = store["df"]
+            expected = df
+            if using_infer_string:
+                # TODO: Test is incorrect when not using_infer_string.
+                #       Should take the last 4 rows unconditionally.
+                expected = expected[-4:]
+            tm.assert_frame_equal(result, expected, check_index_type=True)
 
             _maybe_remove(store, "df2")
             store.append("df2", df[:10], dropna=False)
@@ -292,7 +295,7 @@ def test_append_frame_column_oriented(setup_path):
         # column oriented
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.index = df.index._with_freq(None)  # freq doesn't round-trip
@@ -417,7 +420,7 @@ def check_col(key, name, size):
             {
                 "A": [0.0, 1.0, 2.0, 3.0, 4.0],
                 "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
                 "D": date_range("20130101", periods=5),
             }
         ).set_index("C")
@@ -444,7 +447,7 @@ def check_col(key, name, size):
         _maybe_remove(store, "df")
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -504,11 +507,12 @@ def test_append_with_empty_string(setup_path):
         tm.assert_frame_equal(store.select("df"), df)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_append_with_data_columns(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -684,8 +688,8 @@ def test_append_misc(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df, chunksize=1)
         result = store.select("df")
@@ -701,8 +705,8 @@ def test_append_misc_chunksize(setup_path, chunksize):
     # more chunksize in append tests
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     df["string"] = "foo"
     df["float322"] = 1.0
@@ -742,15 +746,15 @@ def test_append_misc_empty_frame(setup_path):
 # the conversion from AM->BM converts the invalid object dtype column into
 # a datetime64 column no longer raising an error
 @td.skip_array_manager_not_yet_implemented
-def test_append_raise(setup_path):
+def test_append_raise(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         # test append with invalid input to get good error messages
 
         # list in column
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["invalid"] = [["a"]] * len(df)
         assert df.dtypes["invalid"] == np.object_
@@ -770,8 +774,8 @@ def test_append_raise(setup_path):
         # datetime with embedded nans as object
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         s = Series(datetime.datetime(2001, 1, 2), index=df.index)
         s = s.astype(object)
@@ -798,8 +802,8 @@ def test_append_raise(setup_path):
         # appending an incompatible table
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df)
 
@@ -876,7 +880,7 @@ def test_append_with_timedelta(setup_path):
 def test_append_to_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -913,12 +917,12 @@ def test_append_to_multiple(setup_path):
 def test_append_to_multiple_dropna(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     ).rename(columns="{}_2".format)
     df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -938,7 +942,7 @@ def test_append_to_multiple_dropna(setup_path):
 def test_append_to_multiple_dropna_false(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py
@@ -16,10 +16,7 @@
     ensure_clean_store,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_categorical(setup_path):
@@ -143,6 +140,7 @@ def test_categorical(setup_path):
             store.select("df3/meta/s/meta")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_categorical_conversion(tmp_path, setup_path):
     # GH13322
     # Check that read_hdf with categorical columns doesn't return rows if
diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -13,10 +11,6 @@
 
 from pandas.io.pytables import read_hdf
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 def test_complex_fixed(tmp_path, setup_path):
     df = DataFrame(
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     CategoricalIndex,
     DataFrame,
@@ -24,10 +22,7 @@
     _maybe_adjust_name,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_pass_spec_to_storer(setup_path):
@@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):
 
     with ensure_clean_store(setup_path) as store:
         # this fails because we have a date in the object block......
-        msg = re.escape(
-            """Cannot serialize the column [datetime1]
-because its data contents are not [string] but [date] object dtype"""
+        msg = "|".join(
+            [
+                re.escape(
+                    "Cannot serialize the column [datetime1]\nbecause its data "
+                    "contents are not [string] but [date] object dtype"
+                ),
+                re.escape("[date] is not implemented as a table column"),
+            ]
         )
         with pytest.raises(TypeError, match=msg):
             store.append("df_unimplemented", df)
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     PY311,
     is_ci_environment,
@@ -34,9 +32,7 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term
 
-pytestmark = [
-    pytest.mark.single_cpu,
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
@@ -323,7 +319,6 @@ def test_complibs(tmp_path, lvl, lib, request):
                 assert node.filters.complib == lib
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.skipif(
     not is_platform_little_endian(), reason="reason platform is not little endian"
 )
@@ -341,7 +336,6 @@ def test_encoding(setup_path):
         tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "val",
     [
@@ -356,7 +350,7 @@ def test_encoding(setup_path):
         [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
     ],
 )
-@pytest.mark.parametrize("dtype", ["category", object])
+@pytest.mark.parametrize("dtype", ["category", None])
 def test_latin_encoding(tmp_path, setup_path, dtype, val):
     enc = "latin-1"
     nan_rep = ""
diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     HDFStore,
@@ -15,10 +13,7 @@
     tables,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_keys(setup_path):
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
@@ -22,9 +22,7 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = [
-    pytest.mark.single_cpu,
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_format_type(tmp_path, setup_path):
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
@@ -28,10 +28,7 @@
 
 from pandas.io.pytables import TableIterator
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -77,10 +74,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
         read_hdf(store, "k1")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_read_column(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
 
@@ -221,7 +219,7 @@ def test_legacy_table_read_py2(datapath):
     tm.assert_frame_equal(expected, result)
 
 
-def test_read_hdf_open_store(tmp_path, setup_path):
+def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
     # GH10330
     # No check for non-string path_or-buf, and no test of open store
     df = DataFrame(
@@ -233,6 +231,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
     df = df.set_index(keys="E", append=True)
 
     path = tmp_path / setup_path
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            df.to_hdf(path, key="df", mode="w")
+        return
     df.to_hdf(path, key="df", mode="w")
     direct = read_hdf(path, "df")
     with HDFStore(path, mode="r") as store:
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py

Original file line number	Diff line number	Diff line change
`@@ -22,9 +22,7 @@`
`22`	`22`	`)`
`23`	`23`	`from pandas.util import _test_decorators as td`
`24`	`24`
`25`		`-pytestmark = [`
`26`		`- pytest.mark.single_cpu,`
`27`		`-]`
	`25`	`+pytestmark = [pytest.mark.single_cpu]`
`28`	`26`
`29`	`27`
`30`	`28`	`def test_format_type(tmp_path, setup_path):`