Merge branch 'main' of https://github.com/pandas-dev/pandas into str_xfail_pytable_misc

rhshadrach · rhshadrach · commit 55ea98ba35b2 · 2025-01-30T16:22:15.000-05:00
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -35,6 +35,7 @@ Other enhancements
 - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
   when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
   updated to work correctly with NumPy >= 2 (:issue:`57739`)
+- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype``  (:issue:`60663`)
 - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -12,6 +12,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._typing import (
     AlignJoin,
@@ -400,7 +402,9 @@ def cons_row(x):
             # This is a mess.
             _dtype: DtypeObj | str | None = dtype
             vdtype = getattr(result, "dtype", None)
-            if self._is_string:
+            if _dtype is not None:
+                pass
+            elif self._is_string:
                 if is_bool_dtype(vdtype):
                     _dtype = result.dtype
                 elif returns_string:
@@ -2141,9 +2145,9 @@ def decode(self, encoding, errors: str = "strict"):
             decoder = codecs.getdecoder(encoding)
             f = lambda x: decoder(x, errors)[0]
         arr = self._data.array
-        # assert isinstance(arr, (StringArray,))
         result = arr._str_map(f)
-        return self._wrap_result(result)
+        dtype = "str" if get_option("future.infer_string") else None
+        return self._wrap_result(result, dtype=dtype)
 
     @forbid_nonstring_types(["bytes"])
     def encode(self, encoding, errors: str = "strict"):
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -5236,7 +5236,9 @@ def _unconvert_string_array(
         dtype = f"U{itemsize}"
 
         if isinstance(data[0], bytes):
-            data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
+            ser = Series(data, copy=False).str.decode(encoding, errors=errors)
+            data = ser.to_numpy()
+            data.flags.writeable = True
         else:
             data = data.astype(dtype, copy=False).astype(object, copy=False)
 
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -22,6 +22,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs.byteswap import (
     read_double_with_byteswap,
     read_float_with_byteswap,
@@ -699,6 +701,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
         rslt = {}
 
         js, jb = 0, 0
+        infer_string = get_option("future.infer_string")
         for j in range(self.column_count):
             name = self.column_names[j]
 
@@ -715,6 +718,9 @@ def _chunk_to_dataframe(self) -> DataFrame:
                 rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False)
                 if self.convert_text and (self.encoding is not None):
                     rslt[name] = self._decode_string(rslt[name].str)
+                    if infer_string:
+                        rslt[name] = rslt[name].astype("str")
+
                 js += 1
             else:
                 self.close()
diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -184,10 +182,12 @@ def test_dropna_multiple_axes(self):
         with pytest.raises(TypeError, match="supplying multiple axes"):
             inp.dropna(how="all", axis=(0, 1), inplace=True)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_dropna_tz_aware_datetime(self):
+    def test_dropna_tz_aware_datetime(self, using_infer_string):
         # GH13407
+
         df = DataFrame()
+        if using_infer_string:
+            df.columns = df.columns.astype("str")
         dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
         dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
         df["Time"] = [dt1]
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -11,7 +11,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
 
 import pandas as pd
 from pandas import (
@@ -2126,12 +2126,19 @@ def test_enum_column_equality():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_mixed_col_index_dtype():
+def test_mixed_col_index_dtype(using_infer_string):
     # GH 47382
     df1 = DataFrame(columns=list("abc"), data=1.0, index=[0])
     df2 = DataFrame(columns=list("abc"), data=0.0, index=[0])
     df1.columns = df2.columns.astype("string")
     result = df1 + df2
     expected = DataFrame(columns=list("abc"), data=1.0, index=[0])
+    if using_infer_string:
+        # df2.columns.dtype will be "str" instead of object,
+        #  so the aligned result will be "string", not object
+        if HAS_PYARROW:
+            dtype = "string[pyarrow]"
+        else:
+            dtype = "string"
+        expected.columns = expected.columns.astype(dtype)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat._constants import (
     IS64,
     WASM,
@@ -20,10 +18,6 @@
 
 from pandas.io.sas.sas7bdat import SAS7BDATReader
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 @pytest.fixture
 def dirpath(datapath):
@@ -246,11 +240,13 @@ def test_zero_variables(datapath):
         pd.read_sas(fname)
 
 
-def test_zero_rows(datapath):
+@pytest.mark.parametrize("encoding", [None, "utf8"])
+def test_zero_rows(datapath, encoding):
     # GH 18198
     fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
-    result = pd.read_sas(fname)
-    expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0]
+    result = pd.read_sas(fname, encoding=encoding)
+    str_value = b"a" if encoding is None else "a"
+    expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0]
     tm.assert_frame_equal(result, expected)
 
 
@@ -409,7 +405,7 @@ def test_0x40_control_byte(datapath):
     fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
     df = pd.read_sas(fname, encoding="ascii")
     fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
-    df0 = pd.read_csv(fname, dtype="object")
+    df0 = pd.read_csv(fname, dtype="str")
     tm.assert_frame_equal(df, df0)
 
 
diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Series,
@@ -364,7 +362,6 @@ def test_with_prefix_contains_get_dummies_NaN_column():
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "default_category, expected",
     [
@@ -401,12 +398,14 @@ def test_with_prefix_contains_get_dummies_NaN_column():
     ],
 )
 def test_with_prefix_default_category(
-    dummies_with_unassigned, default_category, expected
+    dummies_with_unassigned, default_category, expected, using_infer_string
 ):
     result = from_dummies(
         dummies_with_unassigned, sep="_", default_category=default_category
     )
     expected = DataFrame(expected)
+    if using_infer_string:
+        expected = expected.astype("str")
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
@@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat):
 
 def test_empty_str_methods(any_string_dtype):
     empty_str = empty = Series(dtype=any_string_dtype)
+    empty_inferred_str = Series(dtype="str")
     if is_object_or_nan_string_dtype(any_string_dtype):
         empty_int = Series(dtype="int64")
         empty_bool = Series(dtype=bool)
@@ -154,7 +155,7 @@ def test_empty_str_methods(any_string_dtype):
     tm.assert_series_equal(empty_str, empty.str.rstrip())
     tm.assert_series_equal(empty_str, empty.str.wrap(42))
     tm.assert_series_equal(empty_str, empty.str.get(0))
-    tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii"))
+    tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii"))
     tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
     # ismethods should always return boolean (GH 29624)
     tm.assert_series_equal(empty_bool, empty.str.isalnum())
@@ -566,7 +567,7 @@ def test_string_slice_out_of_bounds(any_string_dtype):
 def test_encode_decode(any_string_dtype):
     ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8")
     result = ser.str.decode("utf-8")
-    expected = ser.map(lambda x: x.decode("utf-8")).astype(object)
+    expected = Series(["a", "b", "a\xe4"], dtype="str")
     tm.assert_series_equal(result, expected)
 
 
@@ -596,7 +597,7 @@ def test_decode_errors_kwarg():
         ser.str.decode("cp1252")
 
     result = ser.str.decode("cp1252", "ignore")
-    expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object)
+    expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str")
     tm.assert_series_equal(result, expected)
 
 
@@ -751,5 +752,5 @@ def test_get_with_dict_label():
 def test_series_str_decode():
     # GH 22613
     result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict")
-    expected = Series(["x", "y"], dtype="object")
+    expected = Series(["x", "y"], dtype="str")
     tm.assert_series_equal(result, expected)