From c55bc0a9b02ce25793fb716bfca324de823f030c Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Mon, 30 Dec 2024 13:52:15 -0500
Subject: [PATCH 01/21] Implement first-class List type

---
 pandas/__init__.py                       |   2 +
 pandas/_testing/asserters.py             |   6 +
 pandas/core/api.py                       |   2 +
 pandas/core/arrays/list_.py              | 137 ++++++++++++++++++++++
 pandas/core/internals/blocks.py          |  11 +-
 pandas/core/internals/managers.py        |   8 +-
 pandas/core/series.py                    |   2 +-
 pandas/io/formats/format.py              |  27 ++++-
 pandas/tests/extension/list/__init__.py  |   7 --
 pandas/tests/extension/list/array.py     | 138 -----------------------
 pandas/tests/extension/list/test_list.py |  12 +-
 11 files changed, 195 insertions(+), 157 deletions(-)
 create mode 100644 pandas/core/arrays/list_.py
 delete mode 100644 pandas/tests/extension/list/__init__.py
 delete mode 100644 pandas/tests/extension/list/array.py

diff --git a/pandas/__init__.py b/pandas/__init__.py
index c570fb8d70204..0cc0a2075355b 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -61,6 +61,7 @@
     PeriodDtype,
     IntervalDtype,
     DatetimeTZDtype,
+    ListDtype,
     StringDtype,
     BooleanDtype,
     # missing
@@ -261,6 +262,7 @@
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
+    "ListDtype",
     "MultiIndex",
     "NaT",
     "NamedAgg",
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index daa5187cdb636..958de0b61e542 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -54,6 +54,7 @@
     TimedeltaArray,
 )
 from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexes.api import safe_sort_index
 
@@ -824,6 +825,11 @@ def assert_extension_array_equal(
             [np.isnan(val) for val in right._ndarray[right_na]]  # type: ignore[attr-defined]
         ), "wrong missing value sentinels"
 
+    # TODO: not every array type may be convertible to NumPy; should catch here
+    if isinstance(left.dtype, ListDtype) and isinstance(right.dtype, ListDtype):
+        assert left._pa_array == right._pa_array
+        return
+
     left_valid = left[~left_na].to_numpy(dtype=object)
     right_valid = right[~right_na].to_numpy(dtype=object)
     if check_exact:
diff --git a/pandas/core/api.py b/pandas/core/api.py
index ec12d543d8389..414b07ad802a9 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -40,6 +40,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import array  # noqa: ICN001
 from pandas.core.flags import Flags
@@ -103,6 +104,7 @@
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
+    "ListDtype",
     "MultiIndex",
     "NaT",
     "NamedAgg",
diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
new file mode 100644
index 0000000000000..f026565daf9a5
--- /dev/null
+++ b/pandas/core/arrays/list_.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    ClassVar,
+)
+
+import numpy as np
+
+from pandas._libs import missing as libmissing
+from pandas.compat import HAS_PYARROW
+from pandas.util._decorators import set_module
+
+from pandas.core.dtypes.base import (
+    ExtensionDtype,
+    register_extension_dtype,
+)
+from pandas.core.dtypes.common import (
+    is_object_dtype,
+    is_string_dtype,
+)
+
+from pandas.core.arrays import ExtensionArray
+
+if TYPE_CHECKING:
+    from pandas._typing import type_t
+
+import pyarrow as pa
+
+
+@register_extension_dtype
+@set_module("pandas")
+class ListDtype(ExtensionDtype):
+    """
+    An ExtensionDtype suitable for storing homogeneous lists of data.
+    """
+
+    type = list
+    name: ClassVar[str] = "list"
+
+    @property
+    def na_value(self) -> libmissing.NAType:
+        return libmissing.NA
+
+    @property
+    def kind(self) -> str:
+        # TODO: our extension interface says this field should be the
+        # NumPy type character, but no such thing exists for list
+        # this assumes a PyArrow large list
+        return "+L"
+
+    @classmethod
+    def construct_array_type(cls) -> type_t[ListArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ListArray
+
+
+class ListArray(ExtensionArray):
+    dtype = ListDtype()
+    __array_priority__ = 1000
+
+    def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> None:
+        if not HAS_PYARROW:
+            raise NotImplementedError("ListArray requires pyarrow to be installed")
+
+        if isinstance(values, type(self)):
+            self._pa_array = values._pa_array
+        elif not isinstance(values, pa.ChunkedArray):
+            # To support NA, we need to create an Array first :-(
+            arr = pa.array(values, from_pandas=True)
+            self._pa_array = pa.chunked_array(arr)
+        else:
+            self._pa_array = values
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
+        if isinstance(scalars, ListArray):
+            return cls(scalars)
+
+        values = pa.array(scalars, from_pandas=True)
+        if values.type == "null":
+            # TODO(wayd): this is a hack to get the tests to pass, but the overall issue
+            # is that our extension types don't support parametrization but the pyarrow
+            values = pa.array(values, type=pa.list_(pa.null()))
+
+        return cls(values)
+
+    def __getitem__(self, item):
+        # PyArrow does not support NumPy's selection with an equal length
+        # mask, so let's convert those to integral positions if needed
+        if isinstance(item, np.ndarray) and item.dtype == bool:
+            pos = np.array(range(len(item)))
+            mask = pos[item]
+            return type(self)(self._pa_array.take(mask))
+        elif isinstance(item, int):  # scalar case
+            return self._pa_array[item]
+
+        return type(self)(self._pa_array[item])
+
+    def __len__(self) -> int:
+        return len(self._pa_array)
+
+    def isna(self):
+        return np.array(self._pa_array.is_null())
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        # TODO: what do we need to do with allow_fill and fill_value here?
+        return type(self)(self._pa_array.take(indexer))
+
+    def copy(self):
+        return type(self)(self._pa_array.take(pa.array(range(len(self._pa_array)))))
+
+    def astype(self, dtype, copy=True):
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            if copy:
+                return self.copy()
+            return self
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # numpy has problems with astype(str) for nested elements
+            # and pyarrow cannot cast from list[string] to string
+            return np.array([str(x) for x in self._pa_array], dtype=dtype)
+
+        if not copy:
+            raise TypeError(f"astype from ListArray to {dtype} requires a copy")
+
+        return np.array(self._pa_array.to_pylist(), dtype=dtype, copy=copy)
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        data = [x._pa_array for x in to_concat]
+        return cls(data)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index f44ad926dda5c..a6b9caedbb579 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -576,7 +576,10 @@ def convert_dtypes(
     @final
     @cache_readonly
     def dtype(self) -> DtypeObj:
-        return self.values.dtype
+        try:
+            return self.values.dtype
+        except AttributeError:  # PyArrow fallback
+            return self.values.type
 
     @final
     def astype(
@@ -2234,12 +2237,16 @@ def new_block(
     *,
     ndim: int,
     refs: BlockValuesRefs | None = None,
+    dtype: DtypeObj | None,
 ) -> Block:
     # caller is responsible for ensuring:
     # - values is NOT a NumpyExtensionArray
     # - check_ndim/ensure_block_shape already checked
     # - maybe_coerce_values already called/unnecessary
-    klass = get_block_type(values.dtype)
+    if dtype:
+        klass = get_block_type(dtype)
+    else:
+        klass = get_block_type(values.dtype)
     return klass(values, ndim=ndim, placement=placement, refs=refs)
 
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index a3738bb25f56c..37d2d5ecf8a45 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1976,14 +1976,18 @@ def from_blocks(
 
     @classmethod
     def from_array(
-        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
+        cls,
+        array: ArrayLike,
+        dtype: DtypeObj | None,
+        index: Index,
+        refs: BlockValuesRefs | None = None,
     ) -> SingleBlockManager:
         """
         Constructor for if we have an array that is not yet a Block.
         """
         array = maybe_coerce_values(array)
         bp = BlockPlacement(slice(0, len(index)))
-        block = new_block(array, placement=bp, ndim=1, refs=refs)
+        block = new_block(array, placement=bp, ndim=1, refs=refs, dtype=dtype)
         return cls(block, index)
 
     def to_2d_mgr(self, columns: Index) -> BlockManager:
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4fa8b86fa4c16..3da69f88c1051 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -505,7 +505,7 @@ def __init__(
                 data = data.copy()
         else:
             data = sanitize_array(data, index, dtype, copy)
-            data = SingleBlockManager.from_array(data, index, refs=refs)
+            data = SingleBlockManager.from_array(data, dtype, index, refs=refs)
 
         NDFrame.__init__(self, data)
         self.name = name
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 46ecb2b9a8f12..d640fb419bebd 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1103,7 +1103,11 @@ def format_array(
     List[str]
     """
     fmt_klass: type[_GenericArrayFormatter]
-    if lib.is_np_dtype(values.dtype, "M"):
+    if hasattr(values, "type") and values.type == "null":
+        fmt_klass = _NullFormatter
+    if hasattr(values, "type") and str(values.type).startswith("list"):
+        fmt_klass = _ListFormatter
+    elif lib.is_np_dtype(values.dtype, "M"):
         fmt_klass = _Datetime64Formatter
         values = cast(DatetimeArray, values)
     elif isinstance(values.dtype, DatetimeTZDtype):
@@ -1467,6 +1471,27 @@ def _format_strings(self) -> list[str]:
         return fmt_values
 
 
+class _NullFormatter(_GenericArrayFormatter):
+    def _format_strings(self) -> list[str]:
+        fmt_values = [str(x) for x in self.values]
+        return fmt_values
+
+
+class _ListFormatter(_GenericArrayFormatter):
+    def _format_strings(self) -> list[str]:
+        # TODO(wayd): This doesn't seem right - where should missing values
+        # be handled
+        fmt_values = []
+        for x in self.values:
+            pyval = x.as_py()
+            if pyval:
+                fmt_values.append(pyval)
+            else:
+                fmt_values.append("")
+
+        return fmt_values
+
+
 class _Datetime64Formatter(_GenericArrayFormatter):
     values: DatetimeArray
 
diff --git a/pandas/tests/extension/list/__init__.py b/pandas/tests/extension/list/__init__.py
deleted file mode 100644
index 0f3f2f3537788..0000000000000
--- a/pandas/tests/extension/list/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from pandas.tests.extension.list.array import (
-    ListArray,
-    ListDtype,
-    make_data,
-)
-
-__all__ = ["ListArray", "ListDtype", "make_data"]
diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py
deleted file mode 100644
index da53bdcb4e37e..0000000000000
--- a/pandas/tests/extension/list/array.py
+++ /dev/null
@@ -1,138 +0,0 @@
-"""
-Test extension array for storing nested data in a pandas container.
-
-The ListArray stores an ndarray of lists.
-"""
-
-from __future__ import annotations
-
-import numbers
-import string
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas.core.dtypes.base import ExtensionDtype
-
-import pandas as pd
-from pandas.api.types import (
-    is_object_dtype,
-    is_string_dtype,
-)
-from pandas.core.arrays import ExtensionArray
-
-if TYPE_CHECKING:
-    from pandas._typing import type_t
-
-
-class ListDtype(ExtensionDtype):
-    type = list
-    name = "list"
-    na_value = np.nan
-
-    @classmethod
-    def construct_array_type(cls) -> type_t[ListArray]:
-        """
-        Return the array type associated with this dtype.
-
-        Returns
-        -------
-        type
-        """
-        return ListArray
-
-
-class ListArray(ExtensionArray):
-    dtype = ListDtype()
-    __array_priority__ = 1000
-
-    def __init__(self, values, dtype=None, copy=False) -> None:
-        if not isinstance(values, np.ndarray):
-            raise TypeError("Need to pass a numpy array as values")
-        for val in values:
-            if not isinstance(val, self.dtype.type) and not pd.isna(val):
-                raise TypeError("All values must be of type " + str(self.dtype.type))
-        self.data = values
-
-    @classmethod
-    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
-        data = np.empty(len(scalars), dtype=object)
-        data[:] = scalars
-        return cls(data)
-
-    def __getitem__(self, item):
-        if isinstance(item, numbers.Integral):
-            return self.data[item]
-        else:
-            # slice, list-like, mask
-            return type(self)(self.data[item])
-
-    def __len__(self) -> int:
-        return len(self.data)
-
-    def isna(self):
-        return np.array(
-            [not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool
-        )
-
-    def take(self, indexer, allow_fill=False, fill_value=None):
-        # re-implement here, since NumPy has trouble setting
-        # sized objects like UserDicts into scalar slots of
-        # an ndarary.
-        indexer = np.asarray(indexer)
-        msg = (
-            "Index is out of bounds or cannot do a "
-            "non-empty take from an empty array."
-        )
-
-        if allow_fill:
-            if fill_value is None:
-                fill_value = self.dtype.na_value
-            # bounds check
-            if (indexer < -1).any():
-                raise ValueError
-            try:
-                output = [
-                    self.data[loc] if loc != -1 else fill_value for loc in indexer
-                ]
-            except IndexError as err:
-                raise IndexError(msg) from err
-        else:
-            try:
-                output = [self.data[loc] for loc in indexer]
-            except IndexError as err:
-                raise IndexError(msg) from err
-
-        return self._from_sequence(output)
-
-    def copy(self):
-        return type(self)(self.data[:])
-
-    def astype(self, dtype, copy=True):
-        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
-            if copy:
-                return self.copy()
-            return self
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # numpy has problems with astype(str) for nested elements
-            return np.array([str(x) for x in self.data], dtype=dtype)
-        elif not copy:
-            return np.asarray(self.data, dtype=dtype)
-        else:
-            return np.array(self.data, dtype=dtype, copy=copy)
-
-    @classmethod
-    def _concat_same_type(cls, to_concat):
-        data = np.concatenate([x.data for x in to_concat])
-        return cls(data)
-
-
-def make_data():
-    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
-    rng = np.random.default_rng(2)
-    data = np.empty(100, dtype=object)
-    data[:] = [
-        [rng.choice(list(string.ascii_letters)) for _ in range(rng.integers(0, 10))]
-        for _ in range(100)
-    ]
-    return data
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index ac396cd3c60d4..a28e52c3bd4d3 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -1,11 +1,11 @@
 import pytest
 
 import pandas as pd
-from pandas.tests.extension.list.array import (
+from pandas.core.arrays.list_ import (
     ListArray,
     ListDtype,
-    make_data,
 )
+from pandas.tests.extension.base.constructors import BaseConstructorsTests
 
 
 @pytest.fixture
@@ -16,12 +16,12 @@ def dtype():
 @pytest.fixture
 def data():
     """Length-100 ListArray for semantics test."""
-    data = make_data()
+    # TODO: make better random data
+    data = [list("a"), list("ab"), list("abc")] * 33 + [None]
+    return ListArray(data)
 
-    while len(data[0]) == len(data[1]):
-        data = make_data()
 
-    return ListArray(data)
+class TestListArray(BaseConstructorsTests): ...
 
 
 def test_to_csv(data):

From 66d8a1d8d9a92b1f4e06db68e82787a222c903b0 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 31 Dec 2024 09:12:31 -0500
Subject: [PATCH 02/21] Brock feedback

---
 pandas/core/internals/blocks.py   | 11 ++---------
 pandas/core/internals/managers.py |  3 +--
 pandas/core/series.py             |  2 +-
 pandas/io/formats/format.py       |  6 +-----
 4 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index a6b9caedbb579..f44ad926dda5c 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -576,10 +576,7 @@ def convert_dtypes(
     @final
     @cache_readonly
     def dtype(self) -> DtypeObj:
-        try:
-            return self.values.dtype
-        except AttributeError:  # PyArrow fallback
-            return self.values.type
+        return self.values.dtype
 
     @final
     def astype(
@@ -2237,16 +2234,12 @@ def new_block(
     *,
     ndim: int,
     refs: BlockValuesRefs | None = None,
-    dtype: DtypeObj | None,
 ) -> Block:
     # caller is responsible for ensuring:
     # - values is NOT a NumpyExtensionArray
     # - check_ndim/ensure_block_shape already checked
     # - maybe_coerce_values already called/unnecessary
-    if dtype:
-        klass = get_block_type(dtype)
-    else:
-        klass = get_block_type(values.dtype)
+    klass = get_block_type(values.dtype)
     return klass(values, ndim=ndim, placement=placement, refs=refs)
 
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 37d2d5ecf8a45..9dc31c3cbf86f 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1978,7 +1978,6 @@ def from_blocks(
     def from_array(
         cls,
         array: ArrayLike,
-        dtype: DtypeObj | None,
         index: Index,
         refs: BlockValuesRefs | None = None,
     ) -> SingleBlockManager:
@@ -1987,7 +1986,7 @@ def from_array(
         """
         array = maybe_coerce_values(array)
         bp = BlockPlacement(slice(0, len(index)))
-        block = new_block(array, placement=bp, ndim=1, refs=refs, dtype=dtype)
+        block = new_block(array, placement=bp, ndim=1, refs=refs)
         return cls(block, index)
 
     def to_2d_mgr(self, columns: Index) -> BlockManager:
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 3da69f88c1051..4fa8b86fa4c16 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -505,7 +505,7 @@ def __init__(
                 data = data.copy()
         else:
             data = sanitize_array(data, index, dtype, copy)
-            data = SingleBlockManager.from_array(data, dtype, index, refs=refs)
+            data = SingleBlockManager.from_array(data, index, refs=refs)
 
         NDFrame.__init__(self, data)
         self.name = name
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index d640fb419bebd..70acaf5498e8d 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1103,11 +1103,7 @@ def format_array(
     List[str]
     """
     fmt_klass: type[_GenericArrayFormatter]
-    if hasattr(values, "type") and values.type == "null":
-        fmt_klass = _NullFormatter
-    if hasattr(values, "type") and str(values.type).startswith("list"):
-        fmt_klass = _ListFormatter
-    elif lib.is_np_dtype(values.dtype, "M"):
+    if lib.is_np_dtype(values.dtype, "M"):
         fmt_klass = _Datetime64Formatter
         values = cast(DatetimeArray, values)
     elif isinstance(values.dtype, DatetimeTZDtype):

From ef378f7bcbbb32b7cfa95d74389ea1688a73f1ba Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 31 Dec 2024 10:06:16 -0500
Subject: [PATCH 03/21] Test cleanups

---
 pandas/core/arrays/list_.py           | 49 +++++++++++++++++++++++++--
 pandas/core/internals/construction.py |  3 ++
 pandas/core/series.py                 |  3 +-
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index f026565daf9a5..e3cc50c6bcd3f 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -23,7 +23,10 @@
 from pandas.core.arrays import ExtensionArray
 
 if TYPE_CHECKING:
-    from pandas._typing import type_t
+    from pandas._typing import (
+        type_t,
+        Shape,
+    )
 
 import pyarrow as pa
 
@@ -82,8 +85,21 @@ def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> Non
     def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
         if isinstance(scalars, ListArray):
             return cls(scalars)
+        elif isinstance(scalars, pa.Scalar):
+            scalars = [scalars]
+            return cls(scalars)
 
-        values = pa.array(scalars, from_pandas=True)
+        try:
+            values = pa.array(scalars, from_pandas=True)
+        except TypeError:
+            # TypeError: object of type 'NoneType' has no len() if you have
+            # pa.ListScalar(None). Upstream issue in Arrow - see:
+            # https://github.com/apache/arrow/issues/40319
+            for i in range(len(scalars)):
+                if not scalars[i].is_valid:
+                    scalars[i] = None
+
+            values = pa.array(scalars, from_pandas=True)
         if values.type == "null":
             # TODO(wayd): this is a hack to get the tests to pass, but the overall issue
             # is that our extension types don't support parametrization but the pyarrow
@@ -113,8 +129,35 @@ def take(self, indexer, allow_fill=False, fill_value=None):
         # TODO: what do we need to do with allow_fill and fill_value here?
         return type(self)(self._pa_array.take(indexer))
 
+    @classmethod
+    def _empty(cls, shape: Shape, dtype: ExtensionDtype):
+        """
+        Create an ExtensionArray with the given shape and dtype.
+
+        See also
+        --------
+        ExtensionDtype.empty
+            ExtensionDtype.empty is the 'official' public version of this API.
+        """
+        # Implementer note: while ExtensionDtype.empty is the public way to
+        # call this method, it is still required to implement this `_empty`
+        # method as well (it is called internally in pandas)
+        if isinstance(shape, tuple):
+            if len(shape) > 1:
+                raise ValueError("ListArray may only be 1-D")
+            else:
+                length = shape[0]
+        else:
+            length = shape
+        return cls._from_sequence([None] * length, dtype=pa.list_(pa.null()))
+
     def copy(self):
-        return type(self)(self._pa_array.take(pa.array(range(len(self._pa_array)))))
+        mm = pa.default_cpu_memory_manager()
+
+        # TODO(wayd): ChunkedArray does not implement copy_to so this
+        # ends up creating an Array
+        copied = self._pa_array.combine_chunks().copy_to(mm.device)
+        return type(self)(copied)
 
     def astype(self, dtype, copy=True):
         if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index dfff34656f82b..6bde7d3fd0d45 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -13,6 +13,7 @@
 
 import numpy as np
 from numpy import ma
+import pyarrow as pa
 
 from pandas._config import using_string_dtype
 
@@ -460,6 +461,8 @@ def treat_as_nested(data) -> bool:
         len(data) > 0
         and is_list_like(data[0])
         and getattr(data[0], "ndim", 1) == 1
+        # TODO(wayd): hack so pyarrow list elements don't expand
+        and not isinstance(data[0], pa.ListScalar)
         and not (isinstance(data, ExtensionArray) and data.ndim == 2)
     )
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4fa8b86fa4c16..612539217168b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -111,6 +111,7 @@
     StructAccessor,
 )
 from pandas.core.arrays.categorical import CategoricalAccessor
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.sparse import SparseAccessor
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import (
@@ -494,7 +495,7 @@ def __init__(
             if not is_list_like(data):
                 data = [data]
             index = default_index(len(data))
-        elif is_list_like(data):
+        elif is_list_like(data) and not isinstance(dtype, ListDtype):
             com.require_length_match(data, index)
 
         # create/copy the manager

From e25c0d4500be7438f9ee5d6e1d275ff8b4d185be Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Tue, 31 Dec 2024 11:34:22 -0500
Subject: [PATCH 04/21] Fix API tests

---
 pandas/tests/api/test_api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index c1d9f5ea4d25c..233b963633057 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -72,6 +72,7 @@ class TestPDApi(Base):
         "RangeIndex",
         "Series",
         "SparseDtype",
+        "ListDtype",
         "StringDtype",
         "Timedelta",
         "TimedeltaIndex",

From 21a69c9a1442e5e4806eea6b38cfc0fa96dc5d35 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 2 Jan 2025 10:44:21 -0500
Subject: [PATCH 05/21] Progress to base.ExtensionArray tests

---
 pandas/tests/extension/list/test_list.py | 48 +++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index a28e52c3bd4d3..ed46e0d1513ed 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -5,7 +5,32 @@
     ListArray,
     ListDtype,
 )
+from pandas.tests.extension.base.accumulate import BaseAccumulateTests
+from pandas.tests.extension.base.casting import BaseCastingTests
 from pandas.tests.extension.base.constructors import BaseConstructorsTests
+from pandas.tests.extension.base.dim2 import (  # noqa: F401
+    Dim2CompatTests,
+    NDArrayBacked2DTests,
+)
+from pandas.tests.extension.base.dtype import BaseDtypeTests
+from pandas.tests.extension.base.getitem import BaseGetitemTests
+from pandas.tests.extension.base.groupby import BaseGroupbyTests
+from pandas.tests.extension.base.index import BaseIndexTests
+from pandas.tests.extension.base.interface import BaseInterfaceTests
+from pandas.tests.extension.base.io import BaseParsingTests
+from pandas.tests.extension.base.methods import BaseMethodsTests
+from pandas.tests.extension.base.missing import BaseMissingTests
+from pandas.tests.extension.base.ops import (  # noqa: F401
+    BaseArithmeticOpsTests,
+    BaseComparisonOpsTests,
+    BaseOpsUtil,
+    BaseUnaryOpsTests,
+)
+from pandas.tests.extension.base.printing import BasePrintingTests
+from pandas.tests.extension.base.reduce import BaseReduceTests
+from pandas.tests.extension.base.reshaping import BaseReshapingTests
+from pandas.tests.extension.base.setitem import BaseSetitemTests
+
 
 
 @pytest.fixture
@@ -21,7 +46,28 @@ def data():
     return ListArray(data)
 
 
-class TestListArray(BaseConstructorsTests): ...
+class TestListArray(
+    BaseAccumulateTests,
+    #BaseCastingTests,
+    BaseConstructorsTests,
+    #BaseDtypeTests,
+    #BaseGetitemTests,
+    #BaseGroupbyTests,
+    BaseIndexTests,
+    #BaseInterfaceTests,
+    BaseParsingTests,
+    #BaseMethodsTests,
+    #BaseMissingTests,
+    #BaseArithmeticOpsTests,
+    #BaseComparisonOpsTests,
+    #BaseUnaryOpsTests,
+    #BasePrintingTests,
+    BaseReduceTests,
+    #BaseReshapingTests,
+    #BaseSetitemTests,
+    Dim2CompatTests,
+):
+    ...
 
 
 def test_to_csv(data):

From 5859e96bd249e3df8d9f77206b2eb0019f255359 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 2 Jan 2025 13:41:50 -0500
Subject: [PATCH 06/21] Improve test coverage

---
 pandas/core/arrays/list_.py              | 146 ++++++++++++-----------
 pandas/core/frame.py                     |   2 +-
 pandas/core/generic.py                   |   6 +-
 pandas/core/internals/construction.py    |   4 +-
 pandas/tests/extension/list/test_list.py |  65 ++++++----
 5 files changed, 126 insertions(+), 97 deletions(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index e3cc50c6bcd3f..a356985698ae5 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -1,13 +1,9 @@
 from __future__ import annotations
 
-from typing import (
-    TYPE_CHECKING,
-    ClassVar,
-)
+from typing import TYPE_CHECKING
 
 import numpy as np
 
-from pandas._libs import missing as libmissing
 from pandas.compat import HAS_PYARROW
 from pandas.util._decorators import set_module
 
@@ -15,12 +11,9 @@
     ExtensionDtype,
     register_extension_dtype,
 )
-from pandas.core.dtypes.common import (
-    is_object_dtype,
-    is_string_dtype,
-)
+from pandas.core.dtypes.dtypes import ArrowDtype
 
-from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.arrow.array import ArrowExtensionArray
 
 if TYPE_CHECKING:
     from pandas._typing import (
@@ -28,28 +21,66 @@
         Shape,
     )
 
+import re
+
 import pyarrow as pa
 
 
+def string_to_pyarrow_type(string: str) -> pa.DataType:
+    # TODO: combine this with to_pyarrow_type in pandas.core.arrays.arrow ?
+    pater = r"list\[(.*)\]"
+
+    if mtch := re.search(pater, string):
+        value_type = mtch.groups()[0]
+        match value_type:
+            # TODO: is there a pyarrow function get a type from the string?
+            case "string" | "large_string":
+                return pa.large_list(pa.large_string())
+            case "int64":
+                return pa.large_list(pa.int64())
+            # TODO: need to implement many more here, including nested
+
+    raise ValueError(f"Cannot map {string} to a pyarrow list type")
+
+
 @register_extension_dtype
 @set_module("pandas")
-class ListDtype(ExtensionDtype):
+class ListDtype(ArrowDtype):
     """
     An ExtensionDtype suitable for storing homogeneous lists of data.
     """
 
-    type = list
-    name: ClassVar[str] = "list"
+    def __init__(self, value_dtype: pa.DataType) -> None:
+        super().__init__(pa.large_list(value_dtype))
+
+    @classmethod
+    def construct_from_string(cls, string: str):
+        if not isinstance(string, str):
+            raise TypeError(
+                f"'construct_from_string' expects a string, got {type(string)}"
+            )
+
+        try:
+            pa_type = string_to_pyarrow_type(string)
+        except ValueError as e:
+            raise TypeError(
+                f"Cannot construct a '{cls.__name__}' from '{string}'"
+            ) from e
+
+        return cls(pa_type)
 
     @property
-    def na_value(self) -> libmissing.NAType:
-        return libmissing.NA
+    def name(self) -> str:  # type: ignore[override]
+        """
+        A string identifying the data type.
+        """
+        return f"list[{self.pyarrow_dtype.value_type!s}]"
 
     @property
     def kind(self) -> str:
-        # TODO: our extension interface says this field should be the
+        # TODO(wayd): our extension interface says this field should be the
         # NumPy type character, but no such thing exists for list
-        # this assumes a PyArrow large list
+        # This uses the Arrow C Data exchange code instead
         return "+L"
 
     @classmethod
@@ -64,22 +95,34 @@ def construct_array_type(cls) -> type_t[ListArray]:
         return ListArray
 
 
-class ListArray(ExtensionArray):
-    dtype = ListDtype()
+class ListArray(ArrowExtensionArray):
     __array_priority__ = 1000
 
-    def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> None:
+    def __init__(
+        self, values: pa.Array | pa.ChunkedArray | list | ListArray, value_type=None
+    ) -> None:
         if not HAS_PYARROW:
             raise NotImplementedError("ListArray requires pyarrow to be installed")
 
         if isinstance(values, type(self)):
             self._pa_array = values._pa_array
-        elif not isinstance(values, pa.ChunkedArray):
-            # To support NA, we need to create an Array first :-(
-            arr = pa.array(values, from_pandas=True)
-            self._pa_array = pa.chunked_array(arr)
         else:
-            self._pa_array = values
+            if value_type is None:
+                if isinstance(values, (pa.Array, pa.ChunkedArray)):
+                    value_type = values.type.value_type
+                else:
+                    value_type = pa.array(values).type.value_type
+
+            if not isinstance(values, pa.ChunkedArray):
+                # To support NA, we need to create an Array first :-(
+                arr = pa.array(values, type=pa.large_list(value_type), from_pandas=True)
+                self._pa_array = pa.chunked_array(arr, type=pa.large_list(value_type))
+            else:
+                self._pa_array = values
+
+    @property
+    def _dtype(self):
+        return ListDtype(self._pa_array.type.value_type)
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
@@ -100,10 +143,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
                     scalars[i] = None
 
             values = pa.array(scalars, from_pandas=True)
-        if values.type == "null":
-            # TODO(wayd): this is a hack to get the tests to pass, but the overall issue
-            # is that our extension types don't support parametrization but the pyarrow
-            values = pa.array(values, type=pa.list_(pa.null()))
+
+        if values.type == "null" and dtype is not None:
+            # TODO: the sequencing here seems wrong; just making the tests pass for now
+            # but this needs a comprehensive review
+            pa_type = string_to_pyarrow_type(str(dtype))
+            values = pa.array(values, type=pa_type)
 
         return cls(values)
 
@@ -114,21 +159,13 @@ def __getitem__(self, item):
             pos = np.array(range(len(item)))
             mask = pos[item]
             return type(self)(self._pa_array.take(mask))
-        elif isinstance(item, int):  # scalar case
+        elif isinstance(item, int):
             return self._pa_array[item]
+        elif isinstance(item, list):
+            return type(self)(self._pa_array.take(item))
 
         return type(self)(self._pa_array[item])
 
-    def __len__(self) -> int:
-        return len(self._pa_array)
-
-    def isna(self):
-        return np.array(self._pa_array.is_null())
-
-    def take(self, indexer, allow_fill=False, fill_value=None):
-        # TODO: what do we need to do with allow_fill and fill_value here?
-        return type(self)(self._pa_array.take(indexer))
-
     @classmethod
     def _empty(cls, shape: Shape, dtype: ExtensionDtype):
         """
@@ -149,32 +186,5 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype):
                 length = shape[0]
         else:
             length = shape
-        return cls._from_sequence([None] * length, dtype=pa.list_(pa.null()))
 
-    def copy(self):
-        mm = pa.default_cpu_memory_manager()
-
-        # TODO(wayd): ChunkedArray does not implement copy_to so this
-        # ends up creating an Array
-        copied = self._pa_array.combine_chunks().copy_to(mm.device)
-        return type(self)(copied)
-
-    def astype(self, dtype, copy=True):
-        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
-            if copy:
-                return self.copy()
-            return self
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # numpy has problems with astype(str) for nested elements
-            # and pyarrow cannot cast from list[string] to string
-            return np.array([str(x) for x in self._pa_array], dtype=dtype)
-
-        if not copy:
-            raise TypeError(f"astype from ListArray to {dtype} requires a copy")
-
-        return np.array(self._pa_array.to_pylist(), dtype=dtype, copy=copy)
-
-    @classmethod
-    def _concat_same_type(cls, to_concat):
-        data = [x._pa_array for x in to_concat]
-        return cls(data)
+        return cls._from_sequence([None] * length, dtype=dtype)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 02878b36a379e..3e9be82168bf4 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -821,7 +821,7 @@ def __init__(
             if len(data) > 0:
                 if is_dataclass(data[0]):
                     data = dataclasses_to_dicts(data)
-                if not isinstance(data, np.ndarray) and treat_as_nested(data):
+                if not isinstance(data, np.ndarray) and treat_as_nested(data, dtype):
                     # exclude ndarray as we may have cast it a few lines above
                     if columns is not None:
                         columns = ensure_index(columns)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index de7fb3682fb4f..42c2bddba02e9 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -23,6 +23,7 @@
 import warnings
 
 import numpy as np
+import pyarrow as pa
 
 from pandas._config import config
 
@@ -7036,7 +7037,8 @@ def fillna(
                 value = Series(value)
                 value = value.reindex(self.index)
                 value = value._values
-            elif not is_list_like(value):
+            elif isinstance(value, pa.ListScalar) or not is_list_like(value):
+                # TODO(wayd): maybe is_list_like should return false for ListScalar?
                 pass
             else:
                 raise TypeError(
@@ -7100,7 +7102,7 @@ def fillna(
             else:
                 return result
 
-        elif not is_list_like(value):
+        elif isinstance(value, pa.ListScalar) or not is_list_like(value):
             if axis == 1:
                 result = self.T.fillna(value=value, limit=limit).T
                 new_data = result._mgr
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 6bde7d3fd0d45..873d373e8bf59 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -47,6 +47,7 @@
     common as com,
 )
 from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import (
     array as pd_array,
@@ -453,7 +454,7 @@ def nested_data_to_arrays(
     return arrays, columns, index
 
 
-def treat_as_nested(data) -> bool:
+def treat_as_nested(data, dtype) -> bool:
     """
     Check if we should use nested_data_to_arrays.
     """
@@ -463,6 +464,7 @@ def treat_as_nested(data) -> bool:
         and getattr(data[0], "ndim", 1) == 1
         # TODO(wayd): hack so pyarrow list elements don't expand
         and not isinstance(data[0], pa.ListScalar)
+        and not isinstance(dtype, ListDtype)
         and not (isinstance(data, ExtensionArray) and data.ndim == 2)
     )
 
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index ed46e0d1513ed..a3f36f0c76665 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -1,24 +1,19 @@
+import pyarrow as pa
 import pytest
 
 import pandas as pd
+import pandas._testing as tm
 from pandas.core.arrays.list_ import (
     ListArray,
     ListDtype,
 )
 from pandas.tests.extension.base.accumulate import BaseAccumulateTests
-from pandas.tests.extension.base.casting import BaseCastingTests
 from pandas.tests.extension.base.constructors import BaseConstructorsTests
 from pandas.tests.extension.base.dim2 import (  # noqa: F401
     Dim2CompatTests,
     NDArrayBacked2DTests,
 )
-from pandas.tests.extension.base.dtype import BaseDtypeTests
-from pandas.tests.extension.base.getitem import BaseGetitemTests
-from pandas.tests.extension.base.groupby import BaseGroupbyTests
 from pandas.tests.extension.base.index import BaseIndexTests
-from pandas.tests.extension.base.interface import BaseInterfaceTests
-from pandas.tests.extension.base.io import BaseParsingTests
-from pandas.tests.extension.base.methods import BaseMethodsTests
 from pandas.tests.extension.base.missing import BaseMissingTests
 from pandas.tests.extension.base.ops import (  # noqa: F401
     BaseArithmeticOpsTests,
@@ -28,14 +23,16 @@
 )
 from pandas.tests.extension.base.printing import BasePrintingTests
 from pandas.tests.extension.base.reduce import BaseReduceTests
-from pandas.tests.extension.base.reshaping import BaseReshapingTests
-from pandas.tests.extension.base.setitem import BaseSetitemTests
 
+# TODO(wayd): This is copied from string tests - is it required here?
+# @pytest.fixture(params=[True, False])
+# def chunked(request):
+#     return request.param
 
 
 @pytest.fixture
 def dtype():
-    return ListDtype()
+    return ListDtype(pa.large_string())
 
 
 @pytest.fixture
@@ -46,28 +43,46 @@ def data():
     return ListArray(data)
 
 
+@pytest.fixture
+def data_missing(dtype):
+    """Length 2 array with [NA, Valid]"""
+    arr = dtype.construct_array_type()._from_sequence([pd.NA, [1, 2, 3]], dtype=dtype)
+    return arr
+
+
 class TestListArray(
     BaseAccumulateTests,
-    #BaseCastingTests,
+    # BaseCastingTests,
     BaseConstructorsTests,
-    #BaseDtypeTests,
-    #BaseGetitemTests,
-    #BaseGroupbyTests,
+    # BaseDtypeTests,
+    # BaseGetitemTests,
+    # BaseGroupbyTests,
     BaseIndexTests,
-    #BaseInterfaceTests,
-    BaseParsingTests,
-    #BaseMethodsTests,
-    #BaseMissingTests,
-    #BaseArithmeticOpsTests,
-    #BaseComparisonOpsTests,
-    #BaseUnaryOpsTests,
-    #BasePrintingTests,
+    # BaseInterfaceTests,
+    # BaseParsingTests,
+    # BaseMethodsTests,
+    BaseMissingTests,
+    # BaseArithmeticOpsTests,
+    # BaseComparisonOpsTests,
+    # BaseUnaryOpsTests,
+    BasePrintingTests,
     BaseReduceTests,
-    #BaseReshapingTests,
-    #BaseSetitemTests,
+    # BaseReshapingTests,
+    # BaseSetitemTests,
     Dim2CompatTests,
 ):
-    ...
+    # TODO(wayd): The tests here are copied from test_arrow.py
+    # It appears the TestArrowArray class has different expectations around
+    # when copies should be made then the base.ExtensionTests
+    # Assuming intentional, maybe in the long term this should just
+    # inherit from TestArrowArray
+    def test_fillna_no_op_returns_copy(self, data):
+        data = data[~data.isna()]
+
+        valid = data[0]
+        result = data.fillna(valid)
+        assert result is not data
+        tm.assert_extension_array_equal(result, data)
 
 
 def test_to_csv(data):

From 9edda3241d6ae6a061861cf8ee857fcd161e742b Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 12:26:15 -0500
Subject: [PATCH 07/21] Implement casting tests

---
 pandas/_libs/lib.pyx                     | 3 +++
 pandas/core/arrays/list_.py              | 9 +++++++++
 pandas/tests/extension/list/test_list.py | 5 ++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index de603beff7836..10a6e0443f45d 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -834,6 +834,9 @@ cpdef ndarray[object] ensure_string_array(
             if isinstance(val, bytes):
                 # GH#49658 discussion of desired behavior here
                 result[i] = val.decode()
+            elif isinstance(val, np.ndarray):
+                # TODO(wayd): is_float_object actually returns true for this...
+                result[i] = str(val.tolist())
             elif not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
                 result[i] = f"{val}"
diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index a356985698ae5..6740663ae10c4 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -11,6 +11,7 @@
     ExtensionDtype,
     register_extension_dtype,
 )
+from pandas.core.dtypes.common import is_string_dtype
 from pandas.core.dtypes.dtypes import ArrowDtype
 
 from pandas.core.arrays.arrow.array import ArrowExtensionArray
@@ -18,6 +19,8 @@
 if TYPE_CHECKING:
     from pandas._typing import (
         type_t,
+        ArrayLike,
+        AstypeArg,
         Shape,
     )
 
@@ -188,3 +191,9 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype):
             length = shape
 
         return cls._from_sequence([None] * length, dtype=dtype)
+
+    def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
+        if is_string_dtype(dtype) and not isinstance(dtype, ExtensionDtype):
+            return np.array([str(x) for x in self], dtype=dtype)
+
+        return super().astype(dtype, copy)
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index a3f36f0c76665..c4b050952415d 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -50,9 +50,12 @@ def data_missing(dtype):
     return arr
 
 
+from pandas.tests.extension.base.casting import BaseCastingTests
+
+
 class TestListArray(
     BaseAccumulateTests,
-    # BaseCastingTests,
+    BaseCastingTests,
     BaseConstructorsTests,
     # BaseDtypeTests,
     # BaseGetitemTests,

From b20572daa40d36f593dc9e38380493863b0566d2 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 12:34:48 -0500
Subject: [PATCH 08/21] Implement dtype tests

---
 pandas/core/arrays/list_.py              | 14 ++++++++++++++
 pandas/tests/extension/list/test_list.py | 10 ++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index 6740663ae10c4..7c0be18ad729c 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -21,6 +21,7 @@
         type_t,
         ArrayLike,
         AstypeArg,
+        DtypeObj,
         Shape,
     )
 
@@ -97,6 +98,19 @@ def construct_array_type(cls) -> type_t[ListArray]:
         """
         return ListArray
 
+    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
+        # TODO(wayd): should we implemented value type support?
+        for dtype in dtypes:
+            if (
+                isinstance(dtype, ListDtype)
+                and self.pyarrow_dtype.value_type == dtype.pyarrow_dtype.value_type
+            ):
+                continue
+            else:
+                return None
+
+        return ListDtype(self.pyarrow_dtype.value_type)
+
 
 class ListArray(ArrowExtensionArray):
     __array_priority__ = 1000
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index c4b050952415d..c63f65ed17136 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -8,11 +8,13 @@
     ListDtype,
 )
 from pandas.tests.extension.base.accumulate import BaseAccumulateTests
+from pandas.tests.extension.base.casting import BaseCastingTests
 from pandas.tests.extension.base.constructors import BaseConstructorsTests
 from pandas.tests.extension.base.dim2 import (  # noqa: F401
     Dim2CompatTests,
     NDArrayBacked2DTests,
 )
+from pandas.tests.extension.base.dtype import BaseDtypeTests
 from pandas.tests.extension.base.index import BaseIndexTests
 from pandas.tests.extension.base.missing import BaseMissingTests
 from pandas.tests.extension.base.ops import (  # noqa: F401
@@ -50,14 +52,11 @@ def data_missing(dtype):
     return arr
 
 
-from pandas.tests.extension.base.casting import BaseCastingTests
-
-
 class TestListArray(
     BaseAccumulateTests,
     BaseCastingTests,
     BaseConstructorsTests,
-    # BaseDtypeTests,
+    BaseDtypeTests,
     # BaseGetitemTests,
     # BaseGroupbyTests,
     BaseIndexTests,
@@ -87,6 +86,9 @@ def test_fillna_no_op_returns_copy(self, data):
         assert result is not data
         tm.assert_extension_array_equal(result, data)
 
+    def test_kind(self, dtype):
+        assert dtype.kind == "+L"
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 9d404e58bcfaa672c3b0344a542f8e97778765ea Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 12:43:46 -0500
Subject: [PATCH 09/21] Implement groupby tests

---
 pandas/tests/extension/list/test_list.py | 25 +++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index c63f65ed17136..d7a402a150e60 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -15,6 +15,7 @@
     NDArrayBacked2DTests,
 )
 from pandas.tests.extension.base.dtype import BaseDtypeTests
+from pandas.tests.extension.base.groupby import BaseGroupbyTests
 from pandas.tests.extension.base.index import BaseIndexTests
 from pandas.tests.extension.base.missing import BaseMissingTests
 from pandas.tests.extension.base.ops import (  # noqa: F401
@@ -52,13 +53,22 @@ def data_missing(dtype):
     return arr
 
 
+@pytest.fixture
+def data_for_grouping(dtype):
+    A = ["a"]
+    B = ["a", "b"]
+    NA = None
+    C = ["a", "b", "c"]
+    return ListArray([B, B, NA, NA, A, A, B, C])
+
+
 class TestListArray(
     BaseAccumulateTests,
     BaseCastingTests,
     BaseConstructorsTests,
     BaseDtypeTests,
     # BaseGetitemTests,
-    # BaseGroupbyTests,
+    BaseGroupbyTests,
     BaseIndexTests,
     # BaseInterfaceTests,
     # BaseParsingTests,
@@ -89,6 +99,19 @@ def test_fillna_no_op_returns_copy(self, data):
     def test_kind(self, dtype):
         assert dtype.kind == "+L"
 
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        pytest.skip(reason="ListArray does not implement mean")
+
+    def test_groupby_extension_no_sort(self, data_for_grouping):
+        pytest.skip(reason="ListArray does not implement mean")
+
+    def test_groupby_extension_transform(self, data_for_grouping):
+        pytest.skip(reason="ListArray does not implement dictionary_encode")
+
+    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
+        pytest.skip(reason="ListArray does not implement dictionary_encode")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 6e83ae0f9e8d07a772d7fb536fac5ddcb09c73d3 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 13:15:44 -0500
Subject: [PATCH 10/21] Implement interface tests

---
 pandas/core/arrays/list_.py              | 18 ++++++++++++++++++
 pandas/tests/extension/list/test_list.py |  6 +++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index 7c0be18ad729c..430b37a254d23 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -54,6 +54,8 @@ class ListDtype(ArrowDtype):
     An ExtensionDtype suitable for storing homogeneous lists of data.
     """
 
+    _is_immutable = True  # TODO(wayd): should we allow mutability?
+
     def __init__(self, value_dtype: pa.DataType) -> None:
         super().__init__(pa.large_list(value_dtype))
 
@@ -211,3 +213,19 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
             return np.array([str(x) for x in self], dtype=dtype)
 
         return super().astype(dtype, copy)
+
+    def __eq__(self, other):
+        if isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
+            from pandas.arrays import BooleanArray
+
+            # TODO: pyarrow.compute does not implement broadcasting equality
+            # for an array of lists to a listscalar
+            # TODO: pyarrow doesn't compare missing values as missing???
+            # arr = pa.array([1, 2, None])
+            # pc.equal(arr, arr[2]) returns all nulls but
+            # arr[2] == arr[2] returns True
+            mask = np.array([False] * len(self))
+            values = np.array([x == other for x in self._pa_array])
+            return BooleanArray(values, mask)
+
+        return super().__eq__(other)
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index d7a402a150e60..c987809fc0c21 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -17,6 +17,7 @@
 from pandas.tests.extension.base.dtype import BaseDtypeTests
 from pandas.tests.extension.base.groupby import BaseGroupbyTests
 from pandas.tests.extension.base.index import BaseIndexTests
+from pandas.tests.extension.base.interface import BaseInterfaceTests
 from pandas.tests.extension.base.missing import BaseMissingTests
 from pandas.tests.extension.base.ops import (  # noqa: F401
     BaseArithmeticOpsTests,
@@ -70,7 +71,7 @@ class TestListArray(
     # BaseGetitemTests,
     BaseGroupbyTests,
     BaseIndexTests,
-    # BaseInterfaceTests,
+    BaseInterfaceTests,
     # BaseParsingTests,
     # BaseMethodsTests,
     BaseMissingTests,
@@ -112,6 +113,9 @@ def test_groupby_extension_transform(self, data_for_grouping):
     def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
         pytest.skip(reason="ListArray does not implement dictionary_encode")
 
+    def test_array_interface(self, data):
+        pytest.skip(reason="ListArrayScalar does not compare to numpy object-dtype")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From fe6e3be8f08ab3cc9d6ab42c93ddcfae29021ff5 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 13:23:18 -0500
Subject: [PATCH 11/21] Skip parsing tests

---
 pandas/tests/extension/list/test_list.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index c987809fc0c21..a520255677d76 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -18,6 +18,7 @@
 from pandas.tests.extension.base.groupby import BaseGroupbyTests
 from pandas.tests.extension.base.index import BaseIndexTests
 from pandas.tests.extension.base.interface import BaseInterfaceTests
+from pandas.tests.extension.base.io import BaseParsingTests
 from pandas.tests.extension.base.missing import BaseMissingTests
 from pandas.tests.extension.base.ops import (  # noqa: F401
     BaseArithmeticOpsTests,
@@ -72,7 +73,7 @@ class TestListArray(
     BaseGroupbyTests,
     BaseIndexTests,
     BaseInterfaceTests,
-    # BaseParsingTests,
+    BaseParsingTests,
     # BaseMethodsTests,
     BaseMissingTests,
     # BaseArithmeticOpsTests,
@@ -116,6 +117,10 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
     def test_array_interface(self, data):
         pytest.skip(reason="ListArrayScalar does not compare to numpy object-dtype")
 
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        pytest.skip(reason="ListArray has not implemented parsing from string")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 4a8ea291f7ddabae96d1edb15422e8feaaecbc14 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 14:13:02 -0500
Subject: [PATCH 12/21] Implement ArithmeticOps tests

---
 pandas/tests/extension/list/test_list.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index a520255677d76..a6f53f9a69e93 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -76,7 +76,7 @@ class TestListArray(
     BaseParsingTests,
     # BaseMethodsTests,
     BaseMissingTests,
-    # BaseArithmeticOpsTests,
+    BaseArithmeticOpsTests,
     # BaseComparisonOpsTests,
     # BaseUnaryOpsTests,
     BasePrintingTests,
@@ -121,6 +121,27 @@ def test_array_interface(self, data):
     def test_EA_types(self, engine, data, request):
         pytest.skip(reason="ListArray has not implemented parsing from string")
 
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+        if all_arithmetic_operators in ("__mod__", "__rmod__"):
+            pytest.skip("ListArray does not implement __mod__ or __rmod__")
+
+        super().test_arith_series_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
+        if all_arithmetic_operators in ("__mod__", "__rmod__"):
+            pytest.skip("ListArray does not implement __mod__ or __rmod__")
+
+        super().test_arith_series_with_array(data, all_arithmetic_operators)
+
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
+        if all_arithmetic_operators in ("__mod__", "__rmod__"):
+            pytest.skip("ListArray does not implement __mod__ or __rmod__")
+
+        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
+
+    def test_divmod(self, data):
+        pytest.skip("ListArray does not implement divmod")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 8f71766a304b10d9392884d8fd744ea30d556ae2 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 16:02:30 -0500
Subject: [PATCH 13/21] Implement ComparisonOps tests

---
 pandas/tests/extension/list/test_list.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index a6f53f9a69e93..b0014cc07b211 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -1,3 +1,5 @@
+import operator
+
 import pyarrow as pa
 import pytest
 
@@ -77,7 +79,7 @@ class TestListArray(
     # BaseMethodsTests,
     BaseMissingTests,
     BaseArithmeticOpsTests,
-    # BaseComparisonOpsTests,
+    BaseComparisonOpsTests,
     # BaseUnaryOpsTests,
     BasePrintingTests,
     BaseReduceTests,
@@ -142,6 +144,18 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
     def test_divmod(self, data):
         pytest.skip("ListArray does not implement divmod")
 
+    def test_compare_scalar(self, data, comparison_op):
+        if comparison_op in (operator.eq, operator.ne):
+            pytest.skip("Series.combine does not properly handle missing values")
+
+        super().test_compare_scalar(data, comparison_op)
+
+    def test_compare_array(self, data, comparison_op):
+        if comparison_op in (operator.eq, operator.ne):
+            pytest.skip("Series.combine does not properly handle missing values")
+
+        super().test_compare_array(data, comparison_op)
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From cf2fb6f0b6bce69809f02c3ea796c19bcfd34636 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 16:03:53 -0500
Subject: [PATCH 14/21] Implement UnaryOps tests

---
 pandas/tests/extension/list/test_list.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index b0014cc07b211..204afc484dd5e 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -80,7 +80,7 @@ class TestListArray(
     BaseMissingTests,
     BaseArithmeticOpsTests,
     BaseComparisonOpsTests,
-    # BaseUnaryOpsTests,
+    BaseUnaryOpsTests,
     BasePrintingTests,
     BaseReduceTests,
     # BaseReshapingTests,
@@ -156,6 +156,9 @@ def test_compare_array(self, data, comparison_op):
 
         super().test_compare_array(data, comparison_op)
 
+    def test_invert(self, data):
+        pytest.skip("ListArray does not implement invert")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 4cef00b443b1eec95f7e1e531dd694545aeb9d87 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 17:26:01 -0500
Subject: [PATCH 15/21] Implement Reshaping tests

---
 pandas/core/arrays/list_.py              | 24 +++++++-
 pandas/core/frame.py                     | 10 ++++
 pandas/tests/extension/list/test_list.py | 71 +++++++++++++++++++++++-
 3 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index 430b37a254d23..d913a315bebd8 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -17,6 +17,7 @@
 from pandas.core.arrays.arrow.array import ArrowExtensionArray
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
     from pandas._typing import (
         type_t,
         ArrayLike,
@@ -47,6 +48,20 @@ def string_to_pyarrow_type(string: str) -> pa.DataType:
     raise ValueError(f"Cannot map {string} to a pyarrow list type")
 
 
+def transpose_homogeneous_list(
+    arrays: Sequence[ListArray],
+) -> list[ListArray]:
+    # TODO: this is the same as transpose_homogeneous_pyarrow
+    # but returns the ListArray instead of an ArrowExtensionArray
+    # should consolidate these
+    arrays = list(arrays)
+    nrows, ncols = len(arrays[0]), len(arrays)
+    indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1)
+    arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks])
+    arr = arr.take(indices)
+    return [ListArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]
+
+
 @register_extension_dtype
 @set_module("pandas")
 class ListDtype(ArrowDtype):
@@ -80,7 +95,10 @@ def name(self) -> str:  # type: ignore[override]
         """
         A string identifying the data type.
         """
-        return f"list[{self.pyarrow_dtype.value_type!s}]"
+        # TODO: reshaping tests require the name list to match the large_list
+        # implementation; assumedly there are some astype(str(dtype)) casts
+        # going on. Should fix so this can just be "list[...]" for end user
+        return f"large_list[{self.pyarrow_dtype.value_type!s}]"
 
     @property
     def kind(self) -> str:
@@ -132,6 +150,10 @@ def __init__(
                 else:
                     value_type = pa.array(values).type.value_type
 
+                # Internally always use large_string instead of string
+                if value_type == pa.string():
+                    value_type = pa.large_string()
+
             if not isinstance(values, pa.ChunkedArray):
                 # To support NA, we need to create an Array first :-(
                 arr = pa.array(values, type=pa.large_list(value_type), from_pandas=True)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3e9be82168bf4..e32355c8fe5f7 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -135,6 +135,7 @@
     PeriodArray,
     TimedeltaArray,
 )
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.construction import (
     ensure_wrapped_if_datetimelike,
@@ -3800,6 +3801,15 @@ def transpose(
                 new_values = transpose_homogeneous_masked_arrays(
                     cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
                 )
+            elif isinstance(first_dtype, ListDtype):
+                from pandas.core.arrays.list_ import (
+                    ListArray,
+                    transpose_homogeneous_list,
+                )
+
+                new_values = transpose_homogeneous_list(
+                    cast(Sequence[ListArray], self._iter_column_arrays())
+                )
             elif isinstance(first_dtype, ArrowDtype):
                 # We have arrow EAs with the same dtype. We can transpose faster.
                 from pandas.core.arrays.arrow.array import (
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index 204afc484dd5e..e7aa558c04011 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -1,3 +1,4 @@
+import itertools
 import operator
 
 import pyarrow as pa
@@ -30,6 +31,7 @@
 )
 from pandas.tests.extension.base.printing import BasePrintingTests
 from pandas.tests.extension.base.reduce import BaseReduceTests
+from pandas.tests.extension.base.reshaping import BaseReshapingTests
 
 # TODO(wayd): This is copied from string tests - is it required here?
 # @pytest.fixture(params=[True, False])
@@ -83,7 +85,7 @@ class TestListArray(
     BaseUnaryOpsTests,
     BasePrintingTests,
     BaseReduceTests,
-    # BaseReshapingTests,
+    BaseReshapingTests,
     # BaseSetitemTests,
     Dim2CompatTests,
 ):
@@ -159,6 +161,73 @@ def test_compare_array(self, data, comparison_op):
     def test_invert(self, data):
         pytest.skip("ListArray does not implement invert")
 
+    def test_merge_on_extension_array(self, data):
+        pytest.skip("ListArray cannot be factorized")
+
+    def test_merge_on_extension_array_duplicates(self, data):
+        pytest.skip("ListArray cannot be factorized")
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            # Two levels, uniform.
+            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
+            # non-uniform
+            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
+            # three levels, non-uniform
+            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
+            pd.MultiIndex.from_tuples(
+                [
+                    ("A", "a", 1),
+                    ("A", "b", 0),
+                    ("A", "a", 0),
+                    ("B", "a", 0),
+                    ("B", "c", 1),
+                ]
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("obj", ["series", "frame"])
+    def test_unstack(self, data, index, obj):
+        # TODO: the base class test casts everything to object
+        # If you remove the object casts, these tests pass...
+        # Check if still needed in base class
+        data = data[: len(index)]
+        if obj == "series":
+            ser = pd.Series(data, index=index)
+        else:
+            ser = pd.DataFrame({"A": data, "B": data}, index=index)
+
+        n = index.nlevels
+        levels = list(range(n))
+        # [0, 1, 2]
+        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
+        combinations = itertools.chain.from_iterable(
+            itertools.permutations(levels, i) for i in range(1, n)
+        )
+
+        for level in combinations:
+            result = ser.unstack(level=level)
+            assert all(
+                isinstance(result[col].array, type(data)) for col in result.columns
+            )
+
+            if obj == "series":
+                # We should get the same result with to_frame+unstack+droplevel
+                df = ser.to_frame()
+
+                alt = df.unstack(level=level).droplevel(0, axis=1)
+                tm.assert_frame_equal(result, alt)
+
+            # obj_ser = ser.astype(object)
+
+            expected = ser.unstack(level=level, fill_value=data.dtype.na_value)
+            # if obj == "series":
+            #    assert (expected.dtypes == object).all()
+
+            # result = result.astype(object)
+            tm.assert_frame_equal(result, expected)
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 47c7af817bafe783bf31191447ae8644ad94a8be Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 17:29:41 -0500
Subject: [PATCH 16/21] Implement SetItem tests

---
 pandas/core/arrays/list_.py              | 4 ++++
 pandas/tests/extension/list/test_list.py | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index d913a315bebd8..5013005c55ad0 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -207,6 +207,10 @@ def __getitem__(self, item):
 
         return type(self)(self._pa_array[item])
 
+    def __setitem__(self, key, value) -> None:
+        msg = "ListArray does not support item assignment via setitem"
+        raise TypeError(msg)
+
     @classmethod
     def _empty(cls, shape: Shape, dtype: ExtensionDtype):
         """
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index e7aa558c04011..e8420d27bb6d7 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -32,6 +32,7 @@
 from pandas.tests.extension.base.printing import BasePrintingTests
 from pandas.tests.extension.base.reduce import BaseReduceTests
 from pandas.tests.extension.base.reshaping import BaseReshapingTests
+from pandas.tests.extension.base.setitem import BaseSetitemTests
 
 # TODO(wayd): This is copied from string tests - is it required here?
 # @pytest.fixture(params=[True, False])
@@ -86,7 +87,7 @@ class TestListArray(
     BasePrintingTests,
     BaseReduceTests,
     BaseReshapingTests,
-    # BaseSetitemTests,
+    BaseSetitemTests,
     Dim2CompatTests,
 ):
     # TODO(wayd): The tests here are copied from test_arrow.py

From 4a5da0ccc00a2459e0f3ce08c1396fdd4be394af Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 18:52:43 -0500
Subject: [PATCH 17/21] Implement GetItem tests

---
 pandas/core/arrays/arrow/array.py        |  13 ++-
 pandas/core/arrays/list_.py              | 108 +++++++++++++++++++++--
 pandas/core/generic.py                   |  26 ++++--
 pandas/tests/extension/list/test_list.py |  23 ++---
 4 files changed, 140 insertions(+), 30 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index afa219f611992..441e3bce9bda9 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -428,7 +428,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """
         if isinstance(value, pa.Scalar):
             pa_scalar = value
-        elif isna(value):
+        elif not is_list_like(value) and isna(value):
             pa_scalar = pa.scalar(None, type=pa_type)
         else:
             # Workaround https://github.com/apache/arrow/issues/37291
@@ -1350,7 +1350,16 @@ def take(
                 # TODO(ARROW-9433): Treat negative indices as NULL
                 indices_array = pa.array(indices_array, mask=fill_mask)
                 result = self._pa_array.take(indices_array)
-                if isna(fill_value):
+                if is_list_like(fill_value):
+                    # TODO: this should be hit by ListArray. Ideally we do:
+                    # pc.replace_with_mask(result, fill_mask, pa.scalar(fill_value))
+                    # but pyarrow does not yet implement that for list types
+                    new_values = [
+                        fill_value if should_fill else x.as_py()
+                        for x, should_fill in zip(result, fill_mask)
+                    ]
+                    return type(self)(new_values)
+                elif isna(fill_value):
                     return type(self)(result)
                 # TODO: ArrowNotImplementedError: Function fill_null has no
                 # kernel matching input types (array[string], scalar[string])
diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index 5013005c55ad0..eeb62b3e50656 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -11,10 +11,15 @@
     ExtensionDtype,
     register_extension_dtype,
 )
-from pandas.core.dtypes.common import is_string_dtype
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer_dtype,
+    is_string_dtype,
+)
 from pandas.core.dtypes.dtypes import ArrowDtype
 
 from pandas.core.arrays.arrow.array import ArrowExtensionArray
+from pandas.core.arrays.base import ExtensionArray
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -146,6 +151,15 @@ def __init__(
         else:
             if value_type is None:
                 if isinstance(values, (pa.Array, pa.ChunkedArray)):
+                    parent_type = values.type
+                    if not isinstance(parent_type, (pa.ListType, pa.LargeListType)):
+                        # Ideally could cast here, but I don't think pyarrow implements
+                        # many list casts
+                        new_values = [
+                            [x.as_py()] if x.is_valid else None for x in values
+                        ]
+                        values = pa.array(new_values, type=pa.large_list(parent_type))
+
                     value_type = values.type.value_type
                 else:
                     value_type = pa.array(values).type.value_type
@@ -193,19 +207,89 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
 
         return cls(values)
 
+    @classmethod
+    def _box_pa(
+        cls, value, pa_type: pa.DataType | None = None
+    ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
+        """
+        Box value into a pyarrow Array, ChunkedArray or Scalar.
+
+        Parameters
+        ----------
+        value : any
+        pa_type : pa.DataType | None
+
+        Returns
+        -------
+        pa.Array or pa.ChunkedArray or pa.Scalar
+        """
+        if (
+            isinstance(value, (pa.ListScalar, pa.LargeListScalar))
+            or isinstance(value, list)
+            or value is None
+        ):
+            return cls._box_pa_scalar(value, pa_type)
+        return cls._box_pa_array(value, pa_type)
+
     def __getitem__(self, item):
         # PyArrow does not support NumPy's selection with an equal length
         # mask, so let's convert those to integral positions if needed
-        if isinstance(item, np.ndarray) and item.dtype == bool:
-            pos = np.array(range(len(item)))
-            mask = pos[item]
-            return type(self)(self._pa_array.take(mask))
+        if isinstance(item, (np.ndarray, ExtensionArray)):
+            if is_bool_dtype(item.dtype):
+                mask_len = len(item)
+                if mask_len != len(self):
+                    raise IndexError(
+                        f"Boolean index has wrong length: {mask_len} "
+                        f"instead of {len(self)}"
+                    )
+                pos = np.array(range(len(item)))
+
+                if isinstance(item, ExtensionArray):
+                    mask = pos[item.fillna(False)]
+                else:
+                    mask = pos[item]
+                return type(self)(self._pa_array.take(mask))
+            elif is_integer_dtype(item.dtype):
+                if isinstance(item, ExtensionArray) and item.isna().any():
+                    msg = "Cannot index with an integer indexer containing NA values"
+                    raise ValueError(msg)
+
+                indexer = pa.array(item)
+                return type(self)(self._pa_array.take(indexer))
         elif isinstance(item, int):
-            return self._pa_array[item]
+            value = self._pa_array[item]
+            if value.is_valid:
+                return value.as_py()
+            else:
+                return self.dtype.na_value
         elif isinstance(item, list):
-            return type(self)(self._pa_array.take(item))
+            # pyarrow does not support taking yet from an empty list
+            # https://github.com/apache/arrow/issues/39917
+            if item:
+                try:
+                    result = self._pa_array.take(item)
+                except pa.lib.ArrowInvalid as e:
+                    if "Could not convert <NA>" in str(e):
+                        msg = (
+                            "Cannot index with an integer indexer containing NA values"
+                        )
+                        raise ValueError(msg) from e
+                    raise e
+            else:
+                result = pa.array([], type=self._pa_array.type)
+
+            return type(self)(result)
+
+        try:
+            result = type(self)(self._pa_array[item])
+        except TypeError as e:
+            msg = (
+                "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                "(`None`) and integer or boolean arrays are valid indices"
+            )
+            raise IndexError(msg) from e
 
-        return type(self)(self._pa_array[item])
+        return result
 
     def __setitem__(self, key, value) -> None:
         msg = "ListArray does not support item assignment via setitem"
@@ -241,7 +325,13 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
         return super().astype(dtype, copy)
 
     def __eq__(self, other):
-        if isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
+        if isinstance(other, list):
+            from pandas.arrays import BooleanArray
+
+            mask = np.array([False] * len(self))
+            values = np.array([x.as_py() == other for x in self._pa_array])
+            return BooleanArray(values, mask)
+        elif isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
             from pandas.arrays import BooleanArray
 
             # TODO: pyarrow.compute does not implement broadcasting equality
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 42c2bddba02e9..438f349c152b1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -23,7 +23,6 @@
 import warnings
 
 import numpy as np
-import pyarrow as pa
 
 from pandas._config import config
 
@@ -150,6 +149,7 @@
 )
 from pandas.core.array_algos.replace import should_use_regex
 from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.base import PandasObject
 from pandas.core.construction import extract_array
 from pandas.core.flags import Flags
@@ -7013,11 +7013,20 @@ def fillna(
                         stacklevel=2,
                     )
 
+        holds_list_array = False
+        if isinstance(self, ABCSeries) and isinstance(self.dtype, ListDtype):
+            holds_list_array = True
+        elif isinstance(self, ABCDataFrame) and any(
+            isinstance(x, ListDtype) for x in self.dtypes
+        ):
+            holds_list_array = True
+
         if isinstance(value, (list, tuple)):
-            raise TypeError(
-                '"value" parameter must be a scalar or dict, but '
-                f'you passed a "{type(value).__name__}"'
-            )
+            if not holds_list_array:
+                raise TypeError(
+                    '"value" parameter must be a scalar or dict, but '
+                    f'you passed a "{type(value).__name__}"'
+                )
 
         # set the default here, so functions examining the signature
         # can detect if something was set (e.g. in groupby) (GH9221)
@@ -7037,8 +7046,9 @@ def fillna(
                 value = Series(value)
                 value = value.reindex(self.index)
                 value = value._values
-            elif isinstance(value, pa.ListScalar) or not is_list_like(value):
-                # TODO(wayd): maybe is_list_like should return false for ListScalar?
+            elif (
+                isinstance(value, list) and isinstance(self.dtype, ListDtype)
+            ) or not is_list_like(value):
                 pass
             else:
                 raise TypeError(
@@ -7102,7 +7112,7 @@ def fillna(
             else:
                 return result
 
-        elif isinstance(value, pa.ListScalar) or not is_list_like(value):
+        elif holds_list_array or not is_list_like(value):
             if axis == 1:
                 result = self.T.fillna(value=value, limit=limit).T
                 new_data = result._mgr
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index e8420d27bb6d7..206d6cec26fda 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -18,6 +18,7 @@
     NDArrayBacked2DTests,
 )
 from pandas.tests.extension.base.dtype import BaseDtypeTests
+from pandas.tests.extension.base.getitem import BaseGetitemTests
 from pandas.tests.extension.base.groupby import BaseGroupbyTests
 from pandas.tests.extension.base.index import BaseIndexTests
 from pandas.tests.extension.base.interface import BaseInterfaceTests
@@ -49,7 +50,7 @@ def dtype():
 def data():
     """Length-100 ListArray for semantics test."""
     # TODO: make better random data
-    data = [list("a"), list("ab"), list("abc")] * 33 + [None]
+    data = [list("a"), list("ab"), list("abc")] * 33 + [list("a")]
     return ListArray(data)
 
 
@@ -74,7 +75,7 @@ class TestListArray(
     BaseCastingTests,
     BaseConstructorsTests,
     BaseDtypeTests,
-    # BaseGetitemTests,
+    BaseGetitemTests,
     BaseGroupbyTests,
     BaseIndexTests,
     BaseInterfaceTests,
@@ -90,12 +91,12 @@ class TestListArray(
     BaseSetitemTests,
     Dim2CompatTests,
 ):
-    # TODO(wayd): The tests here are copied from test_arrow.py
-    # It appears the TestArrowArray class has different expectations around
-    # when copies should be made then the base.ExtensionTests
-    # Assuming intentional, maybe in the long term this should just
-    # inherit from TestArrowArray
     def test_fillna_no_op_returns_copy(self, data):
+        # TODO(wayd): This test is copied from test_arrow.py
+        # It appears the TestArrowArray class has different expectations around
+        # when copies should be made then the base.ExtensionTests
+        # Assuming intentional, maybe in the long term this should just
+        # inherit from TestArrowArray
         data = data[~data.isna()]
 
         valid = data[0]
@@ -154,10 +155,7 @@ def test_compare_scalar(self, data, comparison_op):
         super().test_compare_scalar(data, comparison_op)
 
     def test_compare_array(self, data, comparison_op):
-        if comparison_op in (operator.eq, operator.ne):
-            pytest.skip("Series.combine does not properly handle missing values")
-
-        super().test_compare_array(data, comparison_op)
+        pytest.skip("ListArray comparison ops are not implemented")
 
     def test_invert(self, data):
         pytest.skip("ListArray does not implement invert")
@@ -229,6 +227,9 @@ def test_unstack(self, data, index, obj):
             # result = result.astype(object)
             tm.assert_frame_equal(result, expected)
 
+    def test_getitem_ellipsis_and_slice(self, data):
+        pytest.skip("ListArray does not support NumPy style ellipsis slicing nor 2-D")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 305fee32eb097eae31ac46c104d6a98f200e6f5a Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 3 Jan 2025 20:18:30 -0500
Subject: [PATCH 18/21] Implement Methods tests

---
 pandas/tests/extension/list/test_list.py | 96 +++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index 206d6cec26fda..64e0f0aad0287 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -23,6 +23,7 @@
 from pandas.tests.extension.base.index import BaseIndexTests
 from pandas.tests.extension.base.interface import BaseInterfaceTests
 from pandas.tests.extension.base.io import BaseParsingTests
+from pandas.tests.extension.base.methods import BaseMethodsTests
 from pandas.tests.extension.base.missing import BaseMissingTests
 from pandas.tests.extension.base.ops import (  # noqa: F401
     BaseArithmeticOpsTests,
@@ -61,6 +62,28 @@ def data_missing(dtype):
     return arr
 
 
+@pytest.fixture
+def data_for_sorting(data_for_grouping):
+    """
+    Length-3 array with a known sort order.
+
+    This should be three items [B, C, A] with
+    A < B < C
+    """
+    pytest.skip("ListArray does not support sorting")
+
+
+@pytest.fixture
+def data_missing_for_sorting(data_for_grouping):
+    """
+    Length-3 array with a known sort order.
+
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    pytest.skip("ListArray does not support sorting")
+
+
 @pytest.fixture
 def data_for_grouping(dtype):
     A = ["a"]
@@ -80,7 +103,7 @@ class TestListArray(
     BaseIndexTests,
     BaseInterfaceTests,
     BaseParsingTests,
-    # BaseMethodsTests,
+    BaseMethodsTests,
     BaseMissingTests,
     BaseArithmeticOpsTests,
     BaseComparisonOpsTests,
@@ -230,6 +253,77 @@ def test_unstack(self, data, index, obj):
     def test_getitem_ellipsis_and_slice(self, data):
         pytest.skip("ListArray does not support NumPy style ellipsis slicing nor 2-D")
 
+    def test_hash_pandas_object(self, data):
+        pytest.skip("ListArray does not support this")
+
+    @pytest.mark.parametrize("dropna", [True, False])
+    def test_value_counts(self, all_data, dropna):
+        pytest.skip("ListArray does not support this")
+
+    def test_value_counts_with_normalize(self, data):
+        pytest.skip("ListArray does not support this")
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data_missing, na_action):
+        pytest.skip("ListArray does not support this")
+
+    @pytest.mark.parametrize("keep", ["first", "last", False])
+    def test_duplicated(self, data, keep):
+        pytest.skip("ListArray does not support this")
+
+    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
+    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
+    def test_unique(self, data, box, method):
+        pytest.skip("ListArray does not support this")
+
+    def test_factorize(self, data_for_grouping):
+        pytest.skip("ListArray does not support this")
+
+    def test_factorize_equivalence(self, data_for_grouping):
+        pytest.skip("ListArray does not support this")
+
+    def test_factorize_empty(self, data):
+        pytest.skip("ListArray does not support this")
+
+    def test_fillna_limit_frame(self, data_missing):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_fillna_limit_series(self, data_missing):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_fillna_copy_frame(self, data_missing):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_fillna_copy_series(self, data_missing):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_combine_le(self, data_repeated):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_combine_first(self, data):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_shift_0_periods(self, data):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_hash_pandas_object_works(self, data, as_frame):
+        pytest.skip("ListArray does not support this")
+
+    def test_where_series(self, data, na_value, as_frame):
+        pytest.skip("Needs review - can assignment be avoided?")
+
+    def test_argsort(self, data_for_sorting):
+        pytest.skip("ListArray does not support this")
+
+    def test_argsort_missing_array(self, data_missing_for_sorting):
+        pytest.skip("ListArray does not support this")
+
+    def test_argsort_missing(self, data_missing_for_sorting):
+        pytest.skip("ListArray does not support this")
+
+    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
+        pytest.skip("ListArray does not support this")
+
 
 def test_to_csv(data):
     # https://github.com/pandas-dev/pandas/issues/28840

From 25087f793704b63aaa731f52aa80ca12de34cae1 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Sat, 4 Jan 2025 01:47:01 -0500
Subject: [PATCH 19/21] Brock feedback

---
 pandas/_libs/lib.pyx        |  3 +--
 pandas/io/formats/format.py | 21 ---------------------
 2 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 10a6e0443f45d..7eaa9b17ee2a1 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -834,8 +834,7 @@ cpdef ndarray[object] ensure_string_array(
             if isinstance(val, bytes):
                 # GH#49658 discussion of desired behavior here
                 result[i] = val.decode()
-            elif isinstance(val, np.ndarray):
-                # TODO(wayd): is_float_object actually returns true for this...
+            elif util.is_array(val):
                 result[i] = str(val.tolist())
             elif not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 70acaf5498e8d..46ecb2b9a8f12 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1467,27 +1467,6 @@ def _format_strings(self) -> list[str]:
         return fmt_values
 
 
-class _NullFormatter(_GenericArrayFormatter):
-    def _format_strings(self) -> list[str]:
-        fmt_values = [str(x) for x in self.values]
-        return fmt_values
-
-
-class _ListFormatter(_GenericArrayFormatter):
-    def _format_strings(self) -> list[str]:
-        # TODO(wayd): This doesn't seem right - where should missing values
-        # be handled
-        fmt_values = []
-        for x in self.values:
-            pyval = x.as_py()
-            if pyval:
-                fmt_values.append(pyval)
-            else:
-                fmt_values.append("")
-
-        return fmt_values
-
-
 class _Datetime64Formatter(_GenericArrayFormatter):
     values: DatetimeArray
 

From 5a2b113526901adad590e66bf5d60f6fa1856467 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Sat, 4 Jan 2025 02:07:03 -0500
Subject: [PATCH 20/21] Assorted cleanups

---
 pandas/core/arrays/list_.py              | 26 +++---------
 pandas/core/internals/construction.py    |  3 --
 pandas/core/internals/managers.py        |  5 +--
 pandas/tests/extension/list/test_list.py | 53 +-----------------------
 4 files changed, 9 insertions(+), 78 deletions(-)

diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index eeb62b3e50656..7a15b41739f79 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -74,7 +74,7 @@ class ListDtype(ArrowDtype):
     An ExtensionDtype suitable for storing homogeneous lists of data.
     """
 
-    _is_immutable = True  # TODO(wayd): should we allow mutability?
+    _is_immutable = True
 
     def __init__(self, value_dtype: pa.DataType) -> None:
         super().__init__(pa.large_list(value_dtype))
@@ -100,10 +100,7 @@ def name(self) -> str:  # type: ignore[override]
         """
         A string identifying the data type.
         """
-        # TODO: reshaping tests require the name list to match the large_list
-        # implementation; assumedly there are some astype(str(dtype)) casts
-        # going on. Should fix so this can just be "list[...]" for end user
-        return f"large_list[{self.pyarrow_dtype.value_type!s}]"
+        return f"list[{self.pyarrow_dtype.value_type!s}]"
 
     @property
     def kind(self) -> str:
@@ -124,7 +121,6 @@ def construct_array_type(cls) -> type_t[ListArray]:
         return ListArray
 
     def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
-        # TODO(wayd): should we implemented value type support?
         for dtype in dtypes:
             if (
                 isinstance(dtype, ListDtype)
@@ -153,8 +149,7 @@ def __init__(
                 if isinstance(values, (pa.Array, pa.ChunkedArray)):
                     parent_type = values.type
                     if not isinstance(parent_type, (pa.ListType, pa.LargeListType)):
-                        # Ideally could cast here, but I don't think pyarrow implements
-                        # many list casts
+                        # TODO: maybe implement native casts in pyarrow
                         new_values = [
                             [x.as_py()] if x.is_valid else None for x in values
                         ]
@@ -164,12 +159,10 @@ def __init__(
                 else:
                     value_type = pa.array(values).type.value_type
 
-                # Internally always use large_string instead of string
                 if value_type == pa.string():
                     value_type = pa.large_string()
 
             if not isinstance(values, pa.ChunkedArray):
-                # To support NA, we need to create an Array first :-(
                 arr = pa.array(values, type=pa.large_list(value_type), from_pandas=True)
                 self._pa_array = pa.chunked_array(arr, type=pa.large_list(value_type))
             else:
@@ -200,8 +193,6 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
             values = pa.array(scalars, from_pandas=True)
 
         if values.type == "null" and dtype is not None:
-            # TODO: the sequencing here seems wrong; just making the tests pass for now
-            # but this needs a comprehensive review
             pa_type = string_to_pyarrow_type(str(dtype))
             values = pa.array(values, type=pa_type)
 
@@ -232,8 +223,6 @@ def _box_pa(
         return cls._box_pa_array(value, pa_type)
 
     def __getitem__(self, item):
-        # PyArrow does not support NumPy's selection with an equal length
-        # mask, so let's convert those to integral positions if needed
         if isinstance(item, (np.ndarray, ExtensionArray)):
             if is_bool_dtype(item.dtype):
                 mask_len = len(item)
@@ -305,9 +294,6 @@ def _empty(cls, shape: Shape, dtype: ExtensionDtype):
         ExtensionDtype.empty
             ExtensionDtype.empty is the 'official' public version of this API.
         """
-        # Implementer note: while ExtensionDtype.empty is the public way to
-        # call this method, it is still required to implement this `_empty`
-        # method as well (it is called internally in pandas)
         if isinstance(shape, tuple):
             if len(shape) > 1:
                 raise ValueError("ListArray may only be 1-D")
@@ -334,9 +320,9 @@ def __eq__(self, other):
         elif isinstance(other, (pa.ListScalar, pa.LargeListScalar)):
             from pandas.arrays import BooleanArray
 
-            # TODO: pyarrow.compute does not implement broadcasting equality
-            # for an array of lists to a listscalar
-            # TODO: pyarrow doesn't compare missing values as missing???
+            # TODO: pyarrow.compute does not implement equal for lists
+            # https://github.com/apache/arrow/issues/45167
+            # TODO: pyarrow doesn't compare missing values in Python as missing???
             # arr = pa.array([1, 2, None])
             # pc.equal(arr, arr[2]) returns all nulls but
             # arr[2] == arr[2] returns True
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 873d373e8bf59..af038c2d6751f 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -13,7 +13,6 @@
 
 import numpy as np
 from numpy import ma
-import pyarrow as pa
 
 from pandas._config import using_string_dtype
 
@@ -462,8 +461,6 @@ def treat_as_nested(data, dtype) -> bool:
         len(data) > 0
         and is_list_like(data[0])
         and getattr(data[0], "ndim", 1) == 1
-        # TODO(wayd): hack so pyarrow list elements don't expand
-        and not isinstance(data[0], pa.ListScalar)
         and not isinstance(dtype, ListDtype)
         and not (isinstance(data, ExtensionArray) and data.ndim == 2)
     )
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 9dc31c3cbf86f..a3738bb25f56c 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1976,10 +1976,7 @@ def from_blocks(
 
     @classmethod
     def from_array(
-        cls,
-        array: ArrayLike,
-        index: Index,
-        refs: BlockValuesRefs | None = None,
+        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
     ) -> SingleBlockManager:
         """
         Constructor for if we have an array that is not yet a Block.
diff --git a/pandas/tests/extension/list/test_list.py b/pandas/tests/extension/list/test_list.py
index 64e0f0aad0287..33d6303796e04 100644
--- a/pandas/tests/extension/list/test_list.py
+++ b/pandas/tests/extension/list/test_list.py
@@ -10,36 +10,7 @@
     ListArray,
     ListDtype,
 )
-from pandas.tests.extension.base.accumulate import BaseAccumulateTests
-from pandas.tests.extension.base.casting import BaseCastingTests
-from pandas.tests.extension.base.constructors import BaseConstructorsTests
-from pandas.tests.extension.base.dim2 import (  # noqa: F401
-    Dim2CompatTests,
-    NDArrayBacked2DTests,
-)
-from pandas.tests.extension.base.dtype import BaseDtypeTests
-from pandas.tests.extension.base.getitem import BaseGetitemTests
-from pandas.tests.extension.base.groupby import BaseGroupbyTests
-from pandas.tests.extension.base.index import BaseIndexTests
-from pandas.tests.extension.base.interface import BaseInterfaceTests
-from pandas.tests.extension.base.io import BaseParsingTests
-from pandas.tests.extension.base.methods import BaseMethodsTests
-from pandas.tests.extension.base.missing import BaseMissingTests
-from pandas.tests.extension.base.ops import (  # noqa: F401
-    BaseArithmeticOpsTests,
-    BaseComparisonOpsTests,
-    BaseOpsUtil,
-    BaseUnaryOpsTests,
-)
-from pandas.tests.extension.base.printing import BasePrintingTests
-from pandas.tests.extension.base.reduce import BaseReduceTests
-from pandas.tests.extension.base.reshaping import BaseReshapingTests
-from pandas.tests.extension.base.setitem import BaseSetitemTests
-
-# TODO(wayd): This is copied from string tests - is it required here?
-# @pytest.fixture(params=[True, False])
-# def chunked(request):
-#     return request.param
+from pandas.tests.extension import base
 
 
 @pytest.fixture
@@ -93,27 +64,7 @@ def data_for_grouping(dtype):
     return ListArray([B, B, NA, NA, A, A, B, C])
 
 
-class TestListArray(
-    BaseAccumulateTests,
-    BaseCastingTests,
-    BaseConstructorsTests,
-    BaseDtypeTests,
-    BaseGetitemTests,
-    BaseGroupbyTests,
-    BaseIndexTests,
-    BaseInterfaceTests,
-    BaseParsingTests,
-    BaseMethodsTests,
-    BaseMissingTests,
-    BaseArithmeticOpsTests,
-    BaseComparisonOpsTests,
-    BaseUnaryOpsTests,
-    BasePrintingTests,
-    BaseReduceTests,
-    BaseReshapingTests,
-    BaseSetitemTests,
-    Dim2CompatTests,
-):
+class TestListArray(base.ExtensionTests):
     def test_fillna_no_op_returns_copy(self, data):
         # TODO(wayd): This test is copied from test_arrow.py
         # It appears the TestArrowArray class has different expectations around

From cc345db3f106925919208339d05d24ced45dabb8 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Sat, 4 Jan 2025 02:36:05 -0500
Subject: [PATCH 21/21] Update list accessor tests

---
 pandas/core/arrays/arrow/accessors.py         |  6 ++--
 pandas/core/arrays/list_.py                   |  6 +---
 .../series/accessors/test_list_accessor.py    | 35 ++++++++++---------
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py
index b220a94d032b5..e5ee23906ddf4 100644
--- a/pandas/core/arrays/arrow/accessors.py
+++ b/pandas/core/arrays/arrow/accessors.py
@@ -18,6 +18,8 @@
 
 from pandas.core.dtypes.common import is_list_like
 
+from pandas.core.arrays.list_ import ListDtype
+
 if not pa_version_under10p1:
     import pyarrow as pa
     import pyarrow.compute as pc
@@ -106,7 +108,7 @@ def len(self) -> Series:
         ...         [1, 2, 3],
         ...         [3],
         ...     ],
-        ...     dtype=pd.ArrowDtype(pa.list_(pa.int64())),
+        ...     dtype=pd.ListDtype(pa.int64()),
         ... )
         >>> s.list.len()
         0    3
@@ -189,7 +191,7 @@ def __getitem__(self, key: int | slice) -> Series:
             sliced = pc.list_slice(self._pa_array, start, stop, step)
             return Series(
                 sliced,
-                dtype=ArrowDtype(sliced.type),
+                dtype=ListDtype(sliced.type.value_type),
                 index=self._data.index,
                 name=self._data.name,
             )
diff --git a/pandas/core/arrays/list_.py b/pandas/core/arrays/list_.py
index 7a15b41739f79..bfddbe5ce2c07 100644
--- a/pandas/core/arrays/list_.py
+++ b/pandas/core/arrays/list_.py
@@ -186,11 +186,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
             # TypeError: object of type 'NoneType' has no len() if you have
             # pa.ListScalar(None). Upstream issue in Arrow - see:
             # https://github.com/apache/arrow/issues/40319
-            for i in range(len(scalars)):
-                if not scalars[i].is_valid:
-                    scalars[i] = None
-
-            values = pa.array(scalars, from_pandas=True)
+            values = pa.array(scalars.to_pylist(), from_pandas=True)
 
         if values.type == "null" and dtype is not None:
             pa_type = string_to_pyarrow_type(str(dtype))
diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py
index bec8ca13a2f5f..909af8ee7c1d9 100644
--- a/pandas/tests/series/accessors/test_list_accessor.py
+++ b/pandas/tests/series/accessors/test_list_accessor.py
@@ -4,6 +4,7 @@
 
 from pandas import (
     ArrowDtype,
+    ListDtype,
     Series,
 )
 import pandas._testing as tm
@@ -16,15 +17,16 @@
 @pytest.mark.parametrize(
     "list_dtype",
     (
-        pa.list_(pa.int64()),
-        pa.list_(pa.int64(), list_size=3),
-        pa.large_list(pa.int64()),
+        ArrowDtype(pa.list_(pa.int64())),
+        ArrowDtype(pa.list_(pa.int64(), list_size=3)),
+        ArrowDtype(pa.large_list(pa.int64())),
+        ListDtype(pa.int64()),
     ),
 )
 def test_list_getitem(list_dtype):
     ser = Series(
         [[1, 2, 3], [4, None, 5], None],
-        dtype=ArrowDtype(list_dtype),
+        dtype=list_dtype,
         name="a",
     )
     actual = ser.list[1]
@@ -36,7 +38,7 @@ def test_list_getitem_index():
     # GH 58425
     ser = Series(
         [[1, 2, 3], [4, None, 5], None],
-        dtype=ArrowDtype(pa.list_(pa.int64())),
+        dtype=ListDtype(pa.int64()),
         index=[1, 3, 7],
         name="a",
     )
@@ -53,7 +55,7 @@ def test_list_getitem_index():
 def test_list_getitem_slice():
     ser = Series(
         [[1, 2, 3], [4, None, 5], None],
-        dtype=ArrowDtype(pa.list_(pa.int64())),
+        dtype=ListDtype(pa.int64()),
         index=[1, 3, 7],
         name="a",
     )
@@ -66,7 +68,7 @@ def test_list_getitem_slice():
         actual = ser.list[1:None:None]
         expected = Series(
             [[2, 3], [None, 5], None],
-            dtype=ArrowDtype(pa.list_(pa.int64())),
+            dtype=ListDtype(pa.int64()),
             index=[1, 3, 7],
             name="a",
         )
@@ -76,18 +78,18 @@ def test_list_getitem_slice():
 def test_list_len():
     ser = Series(
         [[1, 2, 3], [4, None], None],
-        dtype=ArrowDtype(pa.list_(pa.int64())),
+        dtype=ListDtype(pa.int64()),
         name="a",
     )
     actual = ser.list.len()
-    expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32()), name="a")
+    expected = Series([3, 2, None], dtype=ArrowDtype(pa.int64()), name="a")
     tm.assert_series_equal(actual, expected)
 
 
 def test_list_flatten():
     ser = Series(
         [[1, 2, 3], None, [4, None], [], [7, 8]],
-        dtype=ArrowDtype(pa.list_(pa.int64())),
+        dtype=ListDtype(pa.int64()),
         name="a",
     )
     actual = ser.list.flatten()
@@ -103,7 +105,7 @@ def test_list_flatten():
 def test_list_getitem_slice_invalid():
     ser = Series(
         [[1, 2, 3], [4, None, 5], None],
-        dtype=ArrowDtype(pa.list_(pa.int64())),
+        dtype=ListDtype(pa.int64()),
     )
     if pa_version_under11p0:
         with pytest.raises(
@@ -133,15 +135,16 @@ def test_list_accessor_non_list_dtype():
 @pytest.mark.parametrize(
     "list_dtype",
     (
-        pa.list_(pa.int64()),
-        pa.list_(pa.int64(), list_size=3),
-        pa.large_list(pa.int64()),
+        ArrowDtype(pa.list_(pa.int64())),
+        ArrowDtype(pa.list_(pa.int64(), list_size=3)),
+        ArrowDtype(pa.large_list(pa.int64())),
+        ListDtype(pa.int64()),
     ),
 )
 def test_list_getitem_invalid_index(list_dtype):
     ser = Series(
         [[1, 2, 3], [4, None, 5], None],
-        dtype=ArrowDtype(list_dtype),
+        dtype=list_dtype,
     )
     with pytest.raises(pa.lib.ArrowInvalid, match="Index -1 is out of bounds"):
         ser.list[-1]
@@ -154,7 +157,7 @@ def test_list_getitem_invalid_index(list_dtype):
 def test_list_accessor_not_iterable():
     ser = Series(
         [[1, 2, 3], [4, None], None],
-        dtype=ArrowDtype(pa.list_(pa.int64())),
+        dtype=ListDtype(pa.int64()),
     )
     with pytest.raises(TypeError, match="'ListAccessor' object is not iterable"):
         iter(ser.list)