Skip to content

Commit 9b015c0

Browse files
authored
Merge branch 'main' into issue-37210-to-sql-truncate
2 parents f5bc6ff + 5b2cddb commit 9b015c0

File tree

9 files changed

+57
-78
lines changed

9 files changed

+57
-78
lines changed

doc/source/whatsnew/v2.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other enhancements
3737
updated to work correctly with NumPy >= 2 (:issue:`57739`)
3838
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
3939
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
40+
- :meth:`Series.str.decode` has gained a ``dtype`` argument to control the dtype of the result (:issue:`60940`)
4041
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
4142
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
4243

pandas/_libs/algos.pyx

+1-27
Original file line numberDiff line numberDiff line change
@@ -818,33 +818,7 @@ def is_monotonic(const numeric_object_t[:] arr, bint timelike):
818818
if timelike and <int64_t>arr[0] == NPY_NAT:
819819
return False, False, False
820820

821-
if numeric_object_t is not object:
822-
with nogil:
823-
prev = arr[0]
824-
for i in range(1, n):
825-
cur = arr[i]
826-
if timelike and <int64_t>cur == NPY_NAT:
827-
is_monotonic_inc = 0
828-
is_monotonic_dec = 0
829-
break
830-
if cur < prev:
831-
is_monotonic_inc = 0
832-
elif cur > prev:
833-
is_monotonic_dec = 0
834-
elif cur == prev:
835-
is_unique = 0
836-
else:
837-
# cur or prev is NaN
838-
is_monotonic_inc = 0
839-
is_monotonic_dec = 0
840-
break
841-
if not is_monotonic_inc and not is_monotonic_dec:
842-
is_monotonic_inc = 0
843-
is_monotonic_dec = 0
844-
break
845-
prev = cur
846-
else:
847-
# object-dtype, identical to above except we cannot use `with nogil`
821+
with nogil(numeric_object_t is not object):
848822
prev = arr[0]
849823
for i in range(1, n):
850824
cur = arr[i]

pandas/_libs/hashtable_func_helper.pxi.in

+1-14
Original file line numberDiff line numberDiff line change
@@ -415,20 +415,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
415415

416416
modes = np.empty(nkeys, dtype=values.dtype)
417417

418-
if htfunc_t is not object:
419-
with nogil:
420-
for k in range(nkeys):
421-
count = counts[k]
422-
if count == max_count:
423-
j += 1
424-
elif count > max_count:
425-
max_count = count
426-
j = 0
427-
else:
428-
continue
429-
430-
modes[j] = keys[k]
431-
else:
418+
with nogil(htfunc_t is not object):
432419
for k in range(nkeys):
433420
count = counts[k]
434421
if count == max_count:

pandas/_libs/internals.pyx

+3-3
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ def get_concat_blkno_indexers(list blknos_list not None):
502502
@cython.boundscheck(False)
503503
@cython.wraparound(False)
504504
def get_blkno_indexers(
505-
int64_t[:] blknos, bint group=True
505+
const int64_t[:] blknos, bint group=True
506506
) -> list[tuple[int, slice | np.ndarray]]:
507507
"""
508508
Enumerate contiguous runs of integers in ndarray.
@@ -596,8 +596,8 @@ def get_blkno_placements(blknos, group: bool = True):
596596
@cython.boundscheck(False)
597597
@cython.wraparound(False)
598598
cpdef update_blklocs_and_blknos(
599-
ndarray[intp_t, ndim=1] blklocs,
600-
ndarray[intp_t, ndim=1] blknos,
599+
const intp_t[:] blklocs,
600+
const intp_t[:] blknos,
601601
Py_ssize_t loc,
602602
intp_t nblocks,
603603
):

pandas/_libs/join.pyx

+10-7
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,10 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
225225

226226
@cython.wraparound(False)
227227
@cython.boundscheck(False)
228-
cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) noexcept nogil:
228+
cdef void _get_result_indexer(
229+
const intp_t[::1] sorter,
230+
intp_t[::1] indexer,
231+
) noexcept nogil:
229232
"""NOTE: overwrites indexer with the result to avoid allocating another array"""
230233
cdef:
231234
Py_ssize_t i, n, idx
@@ -681,8 +684,8 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
681684
from pandas._libs.hashtable cimport Int64HashTable
682685

683686

684-
def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
685-
ndarray[numeric_t] right_values,
687+
def asof_join_backward_on_X_by_Y(const numeric_t[:] left_values,
688+
const numeric_t[:] right_values,
686689
const int64_t[:] left_by_values,
687690
const int64_t[:] right_by_values,
688691
bint allow_exact_matches=True,
@@ -752,8 +755,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
752755
return left_indexer, right_indexer
753756

754757

755-
def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
756-
ndarray[numeric_t] right_values,
758+
def asof_join_forward_on_X_by_Y(const numeric_t[:] left_values,
759+
const numeric_t[:] right_values,
757760
const int64_t[:] left_by_values,
758761
const int64_t[:] right_by_values,
759762
bint allow_exact_matches=1,
@@ -824,8 +827,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
824827
return left_indexer, right_indexer
825828

826829

827-
def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
828-
ndarray[numeric_t] right_values,
830+
def asof_join_nearest_on_X_by_Y(const numeric_t[:] left_values,
831+
const numeric_t[:] right_values,
829832
const int64_t[:] left_by_values,
830833
const int64_t[:] right_by_values,
831834
bint allow_exact_matches=True,

pandas/_libs/lib.pyx

+2-4
Original file line numberDiff line numberDiff line change
@@ -981,16 +981,14 @@ def get_level_sorter(
981981

982982
@cython.boundscheck(False)
983983
@cython.wraparound(False)
984-
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
984+
def count_level_2d(const uint8_t[:, :] mask,
985985
const intp_t[:] labels,
986986
Py_ssize_t max_bin,
987987
):
988988
cdef:
989-
Py_ssize_t i, j, k, n
989+
Py_ssize_t i, j, k = mask.shape[1], n = mask.shape[0]
990990
ndarray[int64_t, ndim=2] counts
991991

992-
n, k = (<object>mask).shape
993-
994992
counts = np.zeros((n, max_bin), dtype="i8")
995993
with nogil:
996994
for i in range(n):

pandas/_libs/reshape.pyx

+1-21
Original file line numberDiff line numberDiff line change
@@ -40,27 +40,7 @@ def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask,
4040
cdef:
4141
Py_ssize_t i, j, w, nulls, s, offset
4242

43-
if numeric_object_t is not object:
44-
# evaluated at compile-time
45-
with nogil:
46-
for i in range(stride):
47-
48-
nulls = 0
49-
for j in range(length):
50-
51-
for w in range(width):
52-
53-
offset = j * width + w
54-
55-
if mask[offset]:
56-
s = i * width + w
57-
new_values[j, s] = values[offset - nulls, i]
58-
new_mask[j, s] = 1
59-
else:
60-
nulls += 1
61-
62-
else:
63-
# object-dtype, identical to above but we cannot use nogil
43+
with nogil(numeric_object_t is not object):
6444
for i in range(stride):
6545

6646
nulls = 0

pandas/core/strings/accessor.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
is_numeric_dtype,
3535
is_object_dtype,
3636
is_re,
37+
is_string_dtype,
3738
)
3839
from pandas.core.dtypes.dtypes import (
3940
ArrowDtype,
@@ -2102,7 +2103,9 @@ def slice_replace(self, start=None, stop=None, repl=None):
21022103
result = self._data.array._str_slice_replace(start, stop, repl)
21032104
return self._wrap_result(result)
21042105

2105-
def decode(self, encoding, errors: str = "strict"):
2106+
def decode(
2107+
self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None
2108+
):
21062109
"""
21072110
Decode character string in the Series/Index using indicated encoding.
21082111
@@ -2116,6 +2119,12 @@ def decode(self, encoding, errors: str = "strict"):
21162119
errors : str, optional
21172120
Specifies the error handling scheme.
21182121
Possible values are those supported by :meth:`bytes.decode`.
2122+
dtype : str or dtype, optional
2123+
The dtype of the result. When not ``None``, must be either a string or
2124+
object dtype. When ``None``, the dtype of the result is determined by
2125+
``pd.options.future.infer_string``.
2126+
2127+
.. versionadded:: 2.3.0
21192128
21202129
Returns
21212130
-------
@@ -2137,6 +2146,10 @@ def decode(self, encoding, errors: str = "strict"):
21372146
2 ()
21382147
dtype: object
21392148
"""
2149+
if dtype is not None and not is_string_dtype(dtype):
2150+
raise ValueError(f"dtype must be string or object, got {dtype=}")
2151+
if dtype is None and get_option("future.infer_string"):
2152+
dtype = "str"
21402153
# TODO: Add a similar _bytes interface.
21412154
if encoding in _cpython_optimized_decoders:
21422155
# CPython optimized implementation
@@ -2146,7 +2159,6 @@ def decode(self, encoding, errors: str = "strict"):
21462159
f = lambda x: decoder(x, errors)[0]
21472160
arr = self._data.array
21482161
result = arr._str_map(f)
2149-
dtype = "str" if get_option("future.infer_string") else None
21502162
return self._wrap_result(result, dtype=dtype)
21512163

21522164
@forbid_nonstring_types(["bytes"])

pandas/tests/strings/test_strings.py

+24
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,30 @@ def test_decode_errors_kwarg():
601601
tm.assert_series_equal(result, expected)
602602

603603

604+
def test_decode_string_dtype(string_dtype):
605+
# https://github.com/pandas-dev/pandas/pull/60940
606+
ser = Series([b"a", b"b"])
607+
result = ser.str.decode("utf-8", dtype=string_dtype)
608+
expected = Series(["a", "b"], dtype=string_dtype)
609+
tm.assert_series_equal(result, expected)
610+
611+
612+
def test_decode_object_dtype(object_dtype):
613+
# https://github.com/pandas-dev/pandas/pull/60940
614+
ser = Series([b"a", rb"\ud800"])
615+
result = ser.str.decode("utf-8", dtype=object_dtype)
616+
expected = Series(["a", r"\ud800"], dtype=object_dtype)
617+
tm.assert_series_equal(result, expected)
618+
619+
620+
def test_decode_bad_dtype():
621+
# https://github.com/pandas-dev/pandas/pull/60940
622+
ser = Series([b"a", b"b"])
623+
msg = "dtype must be string or object, got dtype='int64'"
624+
with pytest.raises(ValueError, match=msg):
625+
ser.str.decode("utf-8", dtype="int64")
626+
627+
604628
@pytest.mark.parametrize(
605629
"form, expected",
606630
[

0 commit comments

Comments
 (0)