Skip to content

Commit ca10c55

Browse files
authored
Merge branch 'main' into issue-37210-to-sql-truncate
2 parents e8930d7 + a81d52f commit ca10c55

21 files changed

+464
-34
lines changed

doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :func:`read_parquet` accepts ``to_pandas_kwargs``, which are forwarded to :meth:`pyarrow.Table.to_pandas`; this enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as Python dictionaries (:issue:`56842`)
58+
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
5859
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
5960
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
6061
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
@@ -799,6 +800,7 @@ Other
799800
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
800801
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
801802
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
803+
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` raising ``ValueError`` when ``regex=True`` and all values are NA. (:issue:`60688`)
802804
- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`)
803805
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
804806
- Bug in DataFrame Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

pandas/_libs/groupby.pyi

+9
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ def group_skew(
9797
result_mask: np.ndarray | None = ...,
9898
skipna: bool = ...,
9999
) -> None: ...
100+
def group_kurt(
101+
out: np.ndarray, # float64_t[:, ::1]
102+
counts: np.ndarray, # int64_t[::1]
103+
values: np.ndarray, # ndarray[float64_t, ndim=2]
104+
labels: np.ndarray, # const intp_t[::1]
105+
mask: np.ndarray | None = ...,
106+
result_mask: np.ndarray | None = ...,
107+
skipna: bool = ...,
108+
) -> None: ...
100109
def group_mean(
101110
out: np.ndarray, # floating[:, ::1]
102111
counts: np.ndarray, # int64_t[::1]

pandas/_libs/groupby.pyx

+96-2
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,7 @@ def group_var(
910910
@cython.wraparound(False)
911911
@cython.boundscheck(False)
912912
@cython.cdivision(True)
913-
@cython.cpow
913+
@cython.cpow(True)
914914
def group_skew(
915915
float64_t[:, ::1] out,
916916
int64_t[::1] counts,
@@ -961,7 +961,7 @@ def group_skew(
961961
isna_entry = _treat_as_na(val, False)
962962

963963
if not isna_entry:
964-
# Based on RunningStats::Push from
964+
# Running stats update based on RunningStats::Push from
965965
# https://www.johndcook.com/blog/skewness_kurtosis/
966966
n1 = nobs[lab, j]
967967
n = n1 + 1
@@ -995,6 +995,100 @@ def group_skew(
995995
)
996996

997997

998+
@cython.wraparound(False)
999+
@cython.boundscheck(False)
1000+
@cython.cdivision(True)
1001+
@cython.cpow(True)
1002+
def group_kurt(
1003+
float64_t[:, ::1] out,
1004+
int64_t[::1] counts,
1005+
ndarray[float64_t, ndim=2] values,
1006+
const intp_t[::1] labels,
1007+
const uint8_t[:, ::1] mask=None,
1008+
uint8_t[:, ::1] result_mask=None,
1009+
bint skipna=True,
1010+
) -> None:
1011+
cdef:
1012+
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
1013+
int64_t[:, ::1] nobs
1014+
Py_ssize_t len_values = len(values), len_labels = len(labels)
1015+
bint isna_entry, uses_mask = mask is not None
1016+
float64_t[:, ::1] M1, M2, M3, M4
1017+
float64_t delta, delta_n, delta_n2, term1, val
1018+
int64_t n1, n
1019+
float64_t ct, num, den, adj
1020+
1021+
if len_values != len_labels:
1022+
raise ValueError("len(index) != len(labels)")
1023+
1024+
nobs = np.zeros((<object>out).shape, dtype=np.int64)
1025+
1026+
# M1 is the running mean; M2, M3 and M4 are the running sums of the 2nd, 3rd and 4th powers of deviations from that mean (unnormalized central moments)
1027+
M1 = np.zeros((<object>out).shape, dtype=np.float64)
1028+
M2 = np.zeros((<object>out).shape, dtype=np.float64)
1029+
M3 = np.zeros((<object>out).shape, dtype=np.float64)
1030+
M4 = np.zeros((<object>out).shape, dtype=np.float64)
1031+
1032+
N, K = (<object>values).shape
1033+
1034+
out[:, :] = 0.0
1035+
1036+
with nogil:
1037+
for i in range(N):
1038+
lab = labels[i]
1039+
if lab < 0:
1040+
continue
1041+
1042+
counts[lab] += 1
1043+
1044+
for j in range(K):
1045+
val = values[i, j]
1046+
1047+
if uses_mask:
1048+
isna_entry = mask[i, j]
1049+
else:
1050+
isna_entry = _treat_as_na(val, False)
1051+
1052+
if not isna_entry:
1053+
# Running stats update based on RunningStats::Push from
1054+
# https://www.johndcook.com/blog/skewness_kurtosis/
1055+
n1 = nobs[lab, j]
1056+
n = n1 + 1
1057+
1058+
nobs[lab, j] = n
1059+
delta = val - M1[lab, j]
1060+
delta_n = delta / n
1061+
delta_n2 = delta_n * delta_n
1062+
term1 = delta * delta_n * n1
1063+
1064+
M1[lab, j] += delta_n
1065+
M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
1066+
+ 6 * delta_n2 * M2[lab, j]
1067+
- 4 * delta_n * M3[lab, j])
1068+
M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
1069+
M2[lab, j] += term1
1070+
elif not skipna:
1071+
M1[lab, j] = NaN
1072+
M2[lab, j] = NaN
1073+
M3[lab, j] = NaN
1074+
M4[lab, j] = NaN
1075+
1076+
for i in range(ngroups):
1077+
for j in range(K):
1078+
ct = <float64_t>nobs[i, j]
1079+
if ct < 4:
1080+
if result_mask is not None:
1081+
result_mask[i, j] = 1
1082+
out[i, j] = NaN
1083+
elif M2[i, j] == 0:
1084+
out[i, j] = 0
1085+
else:
1086+
num = ct * (ct + 1) * (ct - 1) * M4[i, j]
1087+
den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
1088+
adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
1089+
out[i, j] = num / den - adj
1090+
1091+
9981092
@cython.wraparound(False)
9991093
@cython.boundscheck(False)
10001094
def group_mean(

pandas/core/array_algos/replace.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ def _check_comparison_types(
8989
op = np.vectorize(
9090
lambda x: bool(re.search(b, x))
9191
if isinstance(x, str) and isinstance(b, (str, Pattern))
92-
else False
92+
else False,
93+
otypes=[bool],
9394
)
9495

9596
# GH#32621 use mask to avoid comparing to NAs

pandas/core/arrays/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -2618,6 +2618,7 @@ def _groupby_op(
26182618
"sem",
26192619
"var",
26202620
"skew",
2621+
"kurt",
26212622
]:
26222623
raise TypeError(
26232624
f"dtype '{self.dtype}' does not support operation '{how}'"

pandas/core/arrays/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,7 @@ def _groupby_op(
27362736
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
27372737

27382738
dtype = self.dtype
2739-
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
2739+
if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]:
27402740
raise TypeError(f"{dtype} type does not support {how} operations")
27412741
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
27422742
# raise TypeError instead of NotImplementedError to ensure we

pandas/core/arrays/datetimelike.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1656,7 +1656,7 @@ def _groupby_op(
16561656
dtype = self.dtype
16571657
if dtype.kind == "M":
16581658
# Adding/multiplying datetimes is not valid
1659-
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
1659+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
16601660
raise TypeError(f"datetime64 type does not support operation '{how}'")
16611661
if how in ["any", "all"]:
16621662
# GH#34479
@@ -1667,7 +1667,7 @@ def _groupby_op(
16671667

16681668
elif isinstance(dtype, PeriodDtype):
16691669
# Adding/multiplying Periods is not valid
1670-
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
1670+
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
16711671
raise TypeError(f"Period type does not support {how} operations")
16721672
if how in ["any", "all"]:
16731673
# GH#34479
@@ -1677,7 +1677,7 @@ def _groupby_op(
16771677
)
16781678
else:
16791679
# timedeltas we can add but not multiply
1680-
if how in ["prod", "cumprod", "skew", "var"]:
1680+
if how in ["prod", "cumprod", "skew", "kurt", "var"]:
16811681
raise TypeError(f"timedelta64 type does not support {how} operations")
16821682

16831683
# All of the functions implemented here are ordinal, so we can

pandas/core/groupby/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class OutputKey:
5050
"sem",
5151
"size",
5252
"skew",
53+
"kurt",
5354
"std",
5455
"sum",
5556
"var",

0 commit comments

Comments
 (0)