Skip to content

Commit 97a06de

Browse files
authored
Backport PR #60709: ENH(string dtype): Make str.decode return str dtype (#60821)
1 parent 04c3e81 commit 97a06de

File tree

7 files changed

+29
-19
lines changed

7 files changed

+29
-19
lines changed

.github/actions/run-tests/action.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ runs:
77
shell: bash -el {0}
88

99
- name: Publish test results
10-
uses: actions/upload-artifact@v3
10+
uses: actions/upload-artifact@v4
1111
with:
1212
name: Test results
1313
path: test-data.xml

doc/source/whatsnew/v2.3.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ Other enhancements
3535
- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
3636
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
3737
updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`)
38+
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
3839
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
3940
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
4041
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)

pandas/core/strings/accessor.py

+7 -3
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313

1414
import numpy as np
1515

16+
from pandas._config import get_option
17+
1618
from pandas._libs import lib
1719
from pandas._typing import (
1820
AlignJoin,
@@ -387,7 +389,9 @@ def cons_row(x):
387389
# This is a mess.
388390
_dtype: DtypeObj | str | None = dtype
389391
vdtype = getattr(result, "dtype", None)
390-
if self._is_string:
392+
if _dtype is not None:
393+
pass
394+
elif self._is_string:
391395
if is_bool_dtype(vdtype):
392396
_dtype = result.dtype
393397
elif returns_string:
@@ -2012,9 +2016,9 @@ def decode(self, encoding, errors: str = "strict"):
20122016
decoder = codecs.getdecoder(encoding)
20132017
f = lambda x: decoder(x, errors)[0]
20142018
arr = self._data.array
2015-
# assert isinstance(arr, (StringArray,))
20162019
result = arr._str_map(f)
2017-
return self._wrap_result(result)
2020+
dtype = "str" if get_option("future.infer_string") else None
2021+
return self._wrap_result(result, dtype=dtype)
20182022

20192023
@forbid_nonstring_types(["bytes"])
20202024
def encode(self, encoding, errors: str = "strict"):

pandas/io/pytables.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -5208,7 +5208,9 @@ def _unconvert_string_array(
52085208
dtype = f"U{itemsize}"
52095209

52105210
if isinstance(data[0], bytes):
5211-
data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5211+
ser = Series(data, copy=False).str.decode(encoding, errors=errors)
5212+
data = ser.to_numpy()
5213+
data.flags.writeable = True
52125214
else:
52135215
data = data.astype(dtype, copy=False).astype(object, copy=False)
52145216

pandas/io/sas/sas7bdat.py

+6
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@
2525

2626
import numpy as np
2727

28+
from pandas._config import get_option
29+
2830
from pandas._libs.byteswap import (
2931
read_double_with_byteswap,
3032
read_float_with_byteswap,
@@ -722,6 +724,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
722724
rslt = {}
723725

724726
js, jb = 0, 0
727+
infer_string = get_option("future.infer_string")
725728
for j in range(self.column_count):
726729
name = self.column_names[j]
727730

@@ -738,6 +741,9 @@ def _chunk_to_dataframe(self) -> DataFrame:
738741
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False)
739742
if self.convert_text and (self.encoding is not None):
740743
rslt[name] = self._decode_string(rslt[name].str)
744+
if infer_string:
745+
rslt[name] = rslt[name].astype("str")
746+
741747
js += 1
742748
else:
743749
self.close()

pandas/tests/io/sas/test_sas7bdat.py

+6 -10
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.compat import IS64
1311
from pandas.errors import EmptyDataError
1412
import pandas.util._test_decorators as td
@@ -18,10 +16,6 @@
1816

1917
from pandas.io.sas.sas7bdat import SAS7BDATReader
2018

21-
pytestmark = pytest.mark.xfail(
22-
using_string_dtype(), reason="TODO(infer_string)", strict=False
23-
)
24-
2519

2620
@pytest.fixture
2721
def dirpath(datapath):
@@ -254,11 +248,13 @@ def test_zero_variables(datapath):
254248
pd.read_sas(fname)
255249

256250

257-
def test_zero_rows(datapath):
251+
@pytest.mark.parametrize("encoding", [None, "utf8"])
252+
def test_zero_rows(datapath, encoding):
258253
# GH 18198
259254
fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
260-
result = pd.read_sas(fname)
261-
expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0]
255+
result = pd.read_sas(fname, encoding=encoding)
256+
str_value = b"a" if encoding is None else "a"
257+
expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0]
262258
tm.assert_frame_equal(result, expected)
263259

264260

@@ -414,7 +410,7 @@ def test_0x40_control_byte(datapath):
414410
fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
415411
df = pd.read_sas(fname, encoding="ascii")
416412
fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
417-
df0 = pd.read_csv(fname, dtype="object")
413+
df0 = pd.read_csv(fname, dtype="str")
418414
tm.assert_frame_equal(df, df0)
419415

420416

pandas/tests/strings/test_strings.py

+5 -4
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat):
9595

9696
def test_empty_str_methods(any_string_dtype):
9797
empty_str = empty = Series(dtype=any_string_dtype)
98+
empty_inferred_str = Series(dtype="str")
9899
if is_object_or_nan_string_dtype(any_string_dtype):
99100
empty_int = Series(dtype="int64")
100101
empty_bool = Series(dtype=bool)
@@ -154,7 +155,7 @@ def test_empty_str_methods(any_string_dtype):
154155
tm.assert_series_equal(empty_str, empty.str.rstrip())
155156
tm.assert_series_equal(empty_str, empty.str.wrap(42))
156157
tm.assert_series_equal(empty_str, empty.str.get(0))
157-
tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii"))
158+
tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii"))
158159
tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
159160
# ismethods should always return boolean (GH 29624)
160161
tm.assert_series_equal(empty_bool, empty.str.isalnum())
@@ -564,7 +565,7 @@ def test_string_slice_out_of_bounds(any_string_dtype):
564565
def test_encode_decode(any_string_dtype):
565566
ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8")
566567
result = ser.str.decode("utf-8")
567-
expected = ser.map(lambda x: x.decode("utf-8")).astype(object)
568+
expected = Series(["a", "b", "a\xe4"], dtype="str")
568569
tm.assert_series_equal(result, expected)
569570

570571

@@ -594,7 +595,7 @@ def test_decode_errors_kwarg():
594595
ser.str.decode("cp1252")
595596

596597
result = ser.str.decode("cp1252", "ignore")
597-
expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object)
598+
expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str")
598599
tm.assert_series_equal(result, expected)
599600

600601

@@ -749,5 +750,5 @@ def test_get_with_dict_label():
749750
def test_series_str_decode():
750751
# GH 22613
751752
result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict")
752-
expected = Series(["x", "y"], dtype="object")
753+
expected = Series(["x", "y"], dtype="str")
753754
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)