Merge branch 'pandas-dev:main' into Zanir-testing

ZanirP · web-flow · commit 5c7e193b9347 · 2025-03-13T18:26:24.000-05:00
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -37,6 +37,7 @@ Other enhancements
   updated to work correctly with NumPy >= 2 (:issue:`57739`)
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype``  (:issue:`60663`)
+- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
 - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
 - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -61,6 +61,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
 - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
 - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
@@ -782,6 +783,7 @@ Reshaping
 ^^^^^^^^^
 - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
 - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
+- Bug in :meth:`DataFrame.explode` producing incorrect results for the :class:`pyarrow.large_list` type (:issue:`61091`)
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
 - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1518,7 +1518,7 @@ cdef object _try_infer_map(object dtype):
 
 def infer_dtype(value: object, skipna: bool = True) -> str:
     """
-    Return a string label of the type of a scalar or list-like of values.
+    Return a string label of the type of the elements in a list-like input.
 
     This method inspects the elements of the provided input and determines
     classification of its data type. It is particularly useful for
@@ -1527,7 +1527,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
 
     Parameters
     ----------
-    value : scalar, list, ndarray, or pandas type
+    value : list, ndarray, or pandas type
         The input data to infer the dtype.
     skipna : bool, default True
         Ignore NaN values when inferring the type.
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -35,6 +35,7 @@
     pa_version_under17p0,
     pa_version_under18p0,
     pa_version_under19p0,
+    pa_version_under20p0,
 )
 
 if TYPE_CHECKING:
@@ -168,4 +169,5 @@ def is_ci_environment() -> bool:
     "pa_version_under17p0",
     "pa_version_under18p0",
     "pa_version_under19p0",
+    "pa_version_under20p0",
 ]
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1938,7 +1938,10 @@ def _explode(self):
         """
         # child class explode method supports only list types; return
         # default implementation for non list types.
-        if not pa.types.is_list(self.dtype.pyarrow_dtype):
+        if not (
+            pa.types.is_list(self.dtype.pyarrow_dtype)
+            or pa.types.is_large_list(self.dtype.pyarrow_dtype)
+        ):
             return super()._explode()
         values = self
         counts = pa.compute.list_value_length(values._pa_array)
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -2,6 +2,7 @@
 
 from typing import (
     TYPE_CHECKING,
+    Any,
     Literal,
 )
 
@@ -29,6 +30,8 @@
 from pandas.core.strings.object_array import ObjectStringArrayMixin
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from pandas._typing import (
         AxisInt,
         Dtype,
@@ -565,3 +568,12 @@ def _wrap_ndarray_result(self, result: np.ndarray):
 
             return TimedeltaArray._simple_new(result, dtype=result.dtype)
         return type(self)(result)
+
+    def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
+        # Pick the per-element formatter from the dtype (``boxed`` is not consulted). NEP 51: https://github.com/numpy/numpy/pull/22449
+        if self.dtype.kind in "SU":  # NumPy bytes ("S") and unicode ("U") dtype kinds
+            return "'{}'".format  # wrap the formatted element in single quotes
+        elif self.dtype == "object":
+            return repr  # object elements are rendered with their own repr
+        else:
+            return str  # every other dtype is rendered with str()
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -2265,7 +2265,7 @@ def type(self):
         elif pa.types.is_null(pa_type):
             # TODO: None? pd.NA? pa.null?
             return type(pa_type)
-        elif isinstance(pa_type, pa.ExtensionType):
+        elif isinstance(pa_type, pa.BaseExtensionType):
             return type(self)(pa_type.storage_type).type
         raise NotImplementedError(pa_type)
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4922,6 +4922,10 @@ def values(self) -> ArrayLike:
            :meth:`Index.to_numpy`, depending on whether you need
            a reference to the underlying data or a NumPy array.
 
+        .. versionchanged:: 3.0.0
+
+           The returned array is read-only.
+
         Returns
         -------
         array: numpy.ndarray or ExtensionArray
diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py
@@ -22,15 +22,15 @@
         np.array([True, False], dtype=bool),
         np.array([0, 1], dtype="datetime64[ns]"),
         np.array([0, 1], dtype="timedelta64[ns]"),
-    ]
+    ],
 )
 def any_numpy_array(request):
     """
     Parametrized fixture for NumPy arrays with different dtypes.
 
     This excludes string and bytes.
     """
-    return request.param
+    return request.param.copy()  # return a copy rather than the parametrized array object itself
 
 
 # ----------------------------------------------------------------------------
@@ -323,3 +323,30 @@ def test_factorize_unsigned():
     tm.assert_numpy_array_equal(res_codes, exp_codes)
 
     tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique))
+
+
+# ----------------------------------------------------------------------------
+# Output formatting
+
+
+def test_array_repr(any_numpy_array):
+    # GH#61085: compare repr(NumpyExtensionArray) with a per-dtype expected string
+    nparray = any_numpy_array
+    arr = NumpyExtensionArray(nparray)
+    if nparray.dtype == "object":
+        values = "['a', 'b']"  # object elements appear quoted
+    elif nparray.dtype == "float64":
+        values = "[0.0, 1.0]"
+    elif str(nparray.dtype).startswith("int"):  # matches any dtype whose name begins with "int"
+        values = "[0, 1]"
+    elif nparray.dtype == "complex128":
+        values = "[0j, (1+2j)]"
+    elif nparray.dtype == "bool":
+        values = "[True, False]"
+    elif nparray.dtype == "datetime64[ns]":
+        values = "[1970-01-01T00:00:00.000000000, 1970-01-01T00:00:00.000000001]"
+    elif nparray.dtype == "timedelta64[ns]":
+        values = "[0 nanoseconds, 1 nanoseconds]"  # NOTE(review): no else branch; ``values`` is unbound for any other dtype
+    expected = f"<NumpyExtensionArray>\n{values}\nLength: 2, dtype: {nparray.dtype}"
+    result = repr(arr)
+    assert result == expected, f"{result} vs {expected}"  # message shows both strings on failure
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -42,6 +42,8 @@
     pa_version_under11p0,
     pa_version_under13p0,
     pa_version_under14p0,
+    pa_version_under19p0,
+    pa_version_under20p0,
 )
 
 from pandas.core.dtypes.dtypes import (
@@ -453,31 +455,24 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques
         self.check_accumulate(ser, op_name, skipna)
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
-        if op_name in ["kurt", "skew"]:
+        if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"):
             return False
 
         dtype = ser.dtype
         # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has
         # no attribute "pyarrow_dtype"
         pa_dtype = dtype.pyarrow_dtype  # type: ignore[union-attr]
-        if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod"]:
+        if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod", "skew"]:
             if pa.types.is_duration(pa_dtype) and op_name in ["sum"]:
                 # summing timedeltas is one case that *is* well-defined
                 pass
             else:
                 return False
-        elif pa.types.is_binary(pa_dtype) and op_name == "sum":
+        elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]:
             return False
         elif (
             pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
-        ) and op_name in [
-            "mean",
-            "median",
-            "prod",
-            "std",
-            "sem",
-            "var",
-        ]:
+        ) and op_name in ["mean", "median", "prod", "std", "sem", "var", "skew"]:
             return False
 
         if (
@@ -561,7 +556,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
             else:
                 cmp_dtype = arr.dtype
         elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
-            if op_name not in ["median", "var", "std", "sem"]:
+            if op_name not in ["median", "var", "std", "sem", "skew"]:
                 cmp_dtype = arr.dtype
             else:
                 cmp_dtype = "float64[pyarrow]"
@@ -579,10 +574,29 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
             }[arr.dtype.kind]
         return cmp_dtype
 
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # RuntimeWarning is ignored for this test
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
+        if (  # xfail only when every condition below holds
+            not pa_version_under20p0  # presumably means pyarrow >= 20 -- confirm in pandas.compat
+            and skipna
+            and all_numeric_reductions == "skew"
+            and (
+                pa.types.is_integer(data.dtype.pyarrow_dtype)
+                or pa.types.is_floating(data.dtype.pyarrow_dtype)
+            )
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="https://github.com/apache/arrow/issues/45733",
+                )
+            )
+        return super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)  # parent test is called without ``request``
+
     @pytest.mark.parametrize("skipna", [True, False])
     def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
         op_name = all_numeric_reductions
-        if op_name == "skew":
+        if op_name == "skew" and pa_version_under20p0:
             if data.dtype._is_numeric:
                 mark = pytest.mark.xfail(reason="skew not implemented")
                 request.applymarker(mark)
@@ -3540,3 +3554,13 @@ def test_categorical_from_arrow_dictionary():
         dtype="int64",
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.skipif(
+    pa_version_under19p0, reason="pa.json_ was introduced in pyarrow v19.0"
+)
+def test_arrow_json_type():
+    # GH 60958: ArrowDtype wrapping pa.json_(pa.string()) must report ``str`` as its ``type``
+    dtype = ArrowDtype(pa.json_(pa.string()))
+    result = dtype.type
+    assert result == str
diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py
@@ -90,3 +90,16 @@ def test_retain_index_attributes2(tmp_path, setup_path):
         df2.to_hdf(path, key="data", append=True)
 
     assert read_hdf(path, "data").index.name is None
+
+
+def test_retain_datetime_attribute(tmp_path, setup_path):
+    path = tmp_path / setup_path  # HDF file location built from the two fixtures
+    ser = Series(
+        ["2024-08-26 15:13:14", "2024-08-26 15:14:14"],
+        dtype="datetime64[us, UTC]",  # microsecond-resolution, UTC-aware datetimes
+    )
+    dataframe = DataFrame(ser)
+    dataframe.to_hdf(path, key="Annotations", mode="w")  # write the frame under key "Annotations"
+
+    recovered_dataframe = read_hdf(path, key="Annotations")  # read the same key back
+    tm.assert_frame_equal(dataframe, recovered_dataframe)  # the frame read back must equal the frame written
diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py
@@ -145,8 +145,9 @@ def test_explode_scalars_can_ignore_index():
 
 
 @pytest.mark.parametrize("ignore_index", [True, False])
-def test_explode_pyarrow_list_type(ignore_index):
-    # GH 53602
+@pytest.mark.parametrize("list_type", ["list_", "large_list"])
+def test_explode_pyarrow_list_type(ignore_index, list_type):
+    # GH 53602, 61091
     pa = pytest.importorskip("pyarrow")
 
     data = [
@@ -156,7 +157,7 @@ def test_explode_pyarrow_list_type(ignore_index):
         [2, 3],
         None,
     ]
-    ser = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
+    ser = pd.Series(data, dtype=pd.ArrowDtype(getattr(pa, list_type)(pa.int64())))
     result = ser.explode(ignore_index=ignore_index)
     expected = pd.Series(
         data=[None, None, 1, None, 2, 3, None],

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@`
`35`	`35`	`pa_version_under17p0,`
`36`	`36`	`pa_version_under18p0,`
`37`	`37`	`pa_version_under19p0,`
	`38`	`+ pa_version_under20p0,`
`38`	`39`	`)`
`39`	`40`
`40`	`41`	`if TYPE_CHECKING:`
`@@ -168,4 +169,5 @@ def is_ci_environment() -> bool:`
`168`	`169`	`"pa_version_under17p0",`
`169`	`170`	`"pa_version_under18p0",`
`170`	`171`	`"pa_version_under19p0",`
	`172`	`+ "pa_version_under20p0",`
`171`	`173`	`]`