Skip to content

Commit 5c7e193

Browse files
authored
Merge branch 'pandas-dev:main' into Zanir-testing
2 parents 562d625 + ddd0aa8 commit 5c7e193

File tree

12 files changed

+111
-22
lines changed

12 files changed

+111
-22
lines changed

doc/source/whatsnew/v2.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other enhancements
3737
updated to work correctly with NumPy >= 2 (:issue:`57739`)
3838
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
3939
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
40+
- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
4041
- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
4142
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
4243
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)

doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ Other enhancements
6161
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
6262
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
6363
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
64+
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
6465
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
6566
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
6667
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
@@ -782,6 +783,7 @@ Reshaping
782783
^^^^^^^^^
783784
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
784785
- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
786+
- Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)
785787
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
786788
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
787789
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)

pandas/_libs/lib.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -1518,7 +1518,7 @@ cdef object _try_infer_map(object dtype):
15181518

15191519
def infer_dtype(value: object, skipna: bool = True) -> str:
15201520
"""
1521-
Return a string label of the type of a scalar or list-like of values.
1521+
Return a string label of the type of the elements in a list-like input.
15221522

15231523
This method inspects the elements of the provided input and determines
15241524
the classification of its data type. It is particularly useful for
@@ -1527,7 +1527,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
15271527

15281528
Parameters
15291529
----------
1530-
value : scalar, list, ndarray, or pandas type
1530+
value : list, ndarray, or pandas type
15311531
The input data to infer the dtype.
15321532
skipna : bool, default True
15331533
Ignore NaN values when inferring the type.

pandas/compat/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
pa_version_under17p0,
3636
pa_version_under18p0,
3737
pa_version_under19p0,
38+
pa_version_under20p0,
3839
)
3940

4041
if TYPE_CHECKING:
@@ -168,4 +169,5 @@ def is_ci_environment() -> bool:
168169
"pa_version_under17p0",
169170
"pa_version_under18p0",
170171
"pa_version_under19p0",
172+
"pa_version_under20p0",
171173
]

pandas/core/arrays/arrow/array.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1938,7 +1938,10 @@ def _explode(self):
19381938
"""
19391939
# child class explode method supports only list types; return
19401940
# default implementation for non list types.
1941-
if not pa.types.is_list(self.dtype.pyarrow_dtype):
1941+
if not (
1942+
pa.types.is_list(self.dtype.pyarrow_dtype)
1943+
or pa.types.is_large_list(self.dtype.pyarrow_dtype)
1944+
):
19421945
return super()._explode()
19431946
values = self
19441947
counts = pa.compute.list_value_length(values._pa_array)

pandas/core/arrays/numpy_.py

+12
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from typing import (
44
TYPE_CHECKING,
5+
Any,
56
Literal,
67
)
78

@@ -29,6 +30,8 @@
2930
from pandas.core.strings.object_array import ObjectStringArrayMixin
3031

3132
if TYPE_CHECKING:
33+
from collections.abc import Callable
34+
3235
from pandas._typing import (
3336
AxisInt,
3437
Dtype,
@@ -565,3 +568,12 @@ def _wrap_ndarray_result(self, result: np.ndarray):
565568

566569
return TimedeltaArray._simple_new(result, dtype=result.dtype)
567570
return type(self)(result)
571+
572+
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
573+
# NEP 51: https://github.com/numpy/numpy/pull/22449
574+
if self.dtype.kind in "SU":
575+
return "'{}'".format
576+
elif self.dtype == "object":
577+
return repr
578+
else:
579+
return str

pandas/core/dtypes/dtypes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2265,7 +2265,7 @@ def type(self):
22652265
elif pa.types.is_null(pa_type):
22662266
# TODO: None? pd.NA? pa.null?
22672267
return type(pa_type)
2268-
elif isinstance(pa_type, pa.ExtensionType):
2268+
elif isinstance(pa_type, pa.BaseExtensionType):
22692269
return type(self)(pa_type.storage_type).type
22702270
raise NotImplementedError(pa_type)
22712271

pandas/core/indexes/base.py

+4
Original file line numberDiff line numberDiff line change
@@ -4922,6 +4922,10 @@ def values(self) -> ArrayLike:
49224922
:meth:`Index.to_numpy`, depending on whether you need
49234923
a reference to the underlying data or a NumPy array.
49244924
4925+
.. versionchanged:: 3.0.0
4926+
4927+
The returned array is read-only.
4928+
49254929
Returns
49264930
-------
49274931
array: numpy.ndarray or ExtensionArray

pandas/tests/arrays/numpy_/test_numpy.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,15 @@
2222
np.array([True, False], dtype=bool),
2323
np.array([0, 1], dtype="datetime64[ns]"),
2424
np.array([0, 1], dtype="timedelta64[ns]"),
25-
]
25+
],
2626
)
2727
def any_numpy_array(request):
2828
"""
2929
Parametrized fixture for NumPy arrays with different dtypes.
3030
3131
This excludes string and bytes.
3232
"""
33-
return request.param
33+
return request.param.copy()
3434

3535

3636
# ----------------------------------------------------------------------------
@@ -323,3 +323,30 @@ def test_factorize_unsigned():
323323
tm.assert_numpy_array_equal(res_codes, exp_codes)
324324

325325
tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique))
326+
327+
328+
# ----------------------------------------------------------------------------
329+
# Output formatting
330+
331+
332+
def test_array_repr(any_numpy_array):
333+
# GH#61085
334+
nparray = any_numpy_array
335+
arr = NumpyExtensionArray(nparray)
336+
if nparray.dtype == "object":
337+
values = "['a', 'b']"
338+
elif nparray.dtype == "float64":
339+
values = "[0.0, 1.0]"
340+
elif str(nparray.dtype).startswith("int"):
341+
values = "[0, 1]"
342+
elif nparray.dtype == "complex128":
343+
values = "[0j, (1+2j)]"
344+
elif nparray.dtype == "bool":
345+
values = "[True, False]"
346+
elif nparray.dtype == "datetime64[ns]":
347+
values = "[1970-01-01T00:00:00.000000000, 1970-01-01T00:00:00.000000001]"
348+
elif nparray.dtype == "timedelta64[ns]":
349+
values = "[0 nanoseconds, 1 nanoseconds]"
350+
expected = f"<NumpyExtensionArray>\n{values}\nLength: 2, dtype: {nparray.dtype}"
351+
result = repr(arr)
352+
assert result == expected, f"{result} vs {expected}"

pandas/tests/extension/test_arrow.py

+37-13
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
pa_version_under11p0,
4343
pa_version_under13p0,
4444
pa_version_under14p0,
45+
pa_version_under19p0,
46+
pa_version_under20p0,
4547
)
4648

4749
from pandas.core.dtypes.dtypes import (
@@ -453,31 +455,24 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques
453455
self.check_accumulate(ser, op_name, skipna)
454456

455457
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
456-
if op_name in ["kurt", "skew"]:
458+
if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"):
457459
return False
458460

459461
dtype = ser.dtype
460462
# error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has
461463
# no attribute "pyarrow_dtype"
462464
pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr]
463-
if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod"]:
465+
if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod", "skew"]:
464466
if pa.types.is_duration(pa_dtype) and op_name in ["sum"]:
465467
# summing timedeltas is one case that *is* well-defined
466468
pass
467469
else:
468470
return False
469-
elif pa.types.is_binary(pa_dtype) and op_name == "sum":
471+
elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]:
470472
return False
471473
elif (
472474
pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
473-
) and op_name in [
474-
"mean",
475-
"median",
476-
"prod",
477-
"std",
478-
"sem",
479-
"var",
480-
]:
475+
) and op_name in ["mean", "median", "prod", "std", "sem", "var", "skew"]:
481476
return False
482477

483478
if (
@@ -561,7 +556,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
561556
else:
562557
cmp_dtype = arr.dtype
563558
elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
564-
if op_name not in ["median", "var", "std", "sem"]:
559+
if op_name not in ["median", "var", "std", "sem", "skew"]:
565560
cmp_dtype = arr.dtype
566561
else:
567562
cmp_dtype = "float64[pyarrow]"
@@ -579,10 +574,29 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
579574
}[arr.dtype.kind]
580575
return cmp_dtype
581576

577+
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
578+
@pytest.mark.parametrize("skipna", [True, False])
579+
def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
580+
if (
581+
not pa_version_under20p0
582+
and skipna
583+
and all_numeric_reductions == "skew"
584+
and (
585+
pa.types.is_integer(data.dtype.pyarrow_dtype)
586+
or pa.types.is_floating(data.dtype.pyarrow_dtype)
587+
)
588+
):
589+
request.applymarker(
590+
pytest.mark.xfail(
591+
reason="https://github.com/apache/arrow/issues/45733",
592+
)
593+
)
594+
return super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
595+
582596
@pytest.mark.parametrize("skipna", [True, False])
583597
def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
584598
op_name = all_numeric_reductions
585-
if op_name == "skew":
599+
if op_name == "skew" and pa_version_under20p0:
586600
if data.dtype._is_numeric:
587601
mark = pytest.mark.xfail(reason="skew not implemented")
588602
request.applymarker(mark)
@@ -3540,3 +3554,13 @@ def test_categorical_from_arrow_dictionary():
35403554
dtype="int64",
35413555
)
35423556
tm.assert_series_equal(result, expected)
3557+
3558+
3559+
@pytest.mark.skipif(
3560+
pa_version_under19p0, reason="pa.json_ was introduced in pyarrow v19.0"
3561+
)
3562+
def test_arrow_json_type():
3563+
# GH 60958
3564+
dtype = ArrowDtype(pa.json_(pa.string()))
3565+
result = dtype.type
3566+
assert result == str

pandas/tests/io/pytables/test_retain_attributes.py

+13
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,16 @@ def test_retain_index_attributes2(tmp_path, setup_path):
9090
df2.to_hdf(path, key="data", append=True)
9191

9292
assert read_hdf(path, "data").index.name is None
93+
94+
95+
def test_retain_datetime_attribute(tmp_path, setup_path):
96+
path = tmp_path / setup_path
97+
ser = Series(
98+
["2024-08-26 15:13:14", "2024-08-26 15:14:14"],
99+
dtype="datetime64[us, UTC]",
100+
)
101+
dataframe = DataFrame(ser)
102+
dataframe.to_hdf(path, key="Annotations", mode="w")
103+
104+
recovered_dataframe = read_hdf(path, key="Annotations")
105+
tm.assert_frame_equal(dataframe, recovered_dataframe)

pandas/tests/series/methods/test_explode.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,9 @@ def test_explode_scalars_can_ignore_index():
145145

146146

147147
@pytest.mark.parametrize("ignore_index", [True, False])
148-
def test_explode_pyarrow_list_type(ignore_index):
149-
# GH 53602
148+
@pytest.mark.parametrize("list_type", ["list_", "large_list"])
149+
def test_explode_pyarrow_list_type(ignore_index, list_type):
150+
# GH 53602, 61091
150151
pa = pytest.importorskip("pyarrow")
151152

152153
data = [
@@ -156,7 +157,7 @@ def test_explode_pyarrow_list_type(ignore_index):
156157
[2, 3],
157158
None,
158159
]
159-
ser = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
160+
ser = pd.Series(data, dtype=pd.ArrowDtype(getattr(pa, list_type)(pa.int64())))
160161
result = ser.explode(ignore_index=ignore_index)
161162
expected = pd.Series(
162163
data=[None, None, 1, None, 2, 3, None],

0 commit comments

Comments
 (0)