pandas-dev · jbrockmendel · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst
@@ -346,9 +346,12 @@ MultiIndex
 I/O
 ^^^
 - :func:`read_csv` with ``memory_map=True`` and an in-memory buffer (e.g. ``BytesIO``) now raises a clear ``ValueError`` instead of a cryptic ``UnsupportedOperation: fileno`` (:issue:`45630`)
+- Fixed bug in :func:`read_csv` with ``engine="pyarrow"`` raising ``AttributeError`` when passing a non-dict ``dtype`` together with ``index_col`` (:issue:`65859`)
+- Fixed bug in :func:`read_csv` with ``engine="pyarrow"`` where a ``defaultdict`` passed as ``dtype`` did not apply its default to columns not explicitly listed (:issue:`65859`)
 - Fixed bug in :func:`read_csv` with the ``c`` engine where an embedded ``\r`` followed by a space in an unquoted field could cause an infinite re-parsing loop, producing spurious rows or a buffer overflow (:issue:`51141`)
 - Fixed bug in :func:`read_excel` where usage of ``skiprows`` could lead to an infinite loop (:issue:`64027`)
 - Fixed bug where :func:`read_html` parsed nested tables incorrectly when using ``html5lib`` or ``bs4`` flavors (:issue:`64524`)
+- Fixed bugs in :func:`read_csv` with ``engine="pyarrow"`` where column names were handled inconsistently with the other engines: duplicated names were not de-duplicated to ``"x.1"``-style names, empty header fields did not get ``"Unnamed: {i}"`` placeholder names, and an unnamed ``index_col`` produced an index named ``""`` instead of an unnamed index (:issue:`65859`)
 - Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows``
   when the requested row count is not a multiple of the chunk size (:issue:`64025`)
 - :meth:`HDFStore.put` and :meth:`HDFStore.append` now support storing :class:`Series` and :class:`DataFrame` columns with :class:`PeriodDtype` in both ``"fixed"`` and ``"table"`` formats (:issue:`41978`)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections import defaultdict
 from typing import TYPE_CHECKING
 import warnings
 
@@ -169,6 +170,55 @@ def _get_convert_options(self):
 
         return convert_options
 
+    def _dedup_column_names(self, raw_names: list[str]) -> list[str]:
+        """
+        Rename header fields the way the other engines do: an empty field
+        at position ``i`` becomes ``"Unnamed: {i}"`` and a repeated name gets
+        a ``".{count}"`` suffix (mirrors pandas._libs.parsers.TextReader).
+        Returns a new list of names; ``raw_names`` is not modified.
+        Side effects: sets ``self.unnamed_cols``; may add keys to ``self.dtype``.
+        """
+        names: list[str] = []
+        unnamed_col_indices: list[int] = []  # positions of empty header fields
+        for i, name in enumerate(raw_names):
+            if name == "":
+                name = f"Unnamed: {i}"
+                unnamed_col_indices.append(i)
+            names.append(name)
+
+        # Visit real header names first and placeholders last, so a header that
+        # literally reads "Unnamed: N" keeps its name and the placeholder is mangled
+        col_loop_order = [
+            i for i in range(len(names)) if i not in unnamed_col_indices
+        ] + unnamed_col_indices
+        counts: dict[str, int] = {}  # name -> next suffix to try if it repeats
+
+        for i in col_loop_order:
+            col = old_col = names[i]
+            cur_count = counts.get(col, 0)
+            # cur_count > 0 means this name was already taken by an earlier column
+            if cur_count > 0:
+                while cur_count > 0:
+                    counts[old_col] = cur_count + 1
+                    col = f"{old_col}.{cur_count}"
+                    if col in names:
+                        cur_count += 1  # candidate is taken; try the next suffix
+                    else:
+                        cur_count = counts.get(col, 0)  # 0 ends the loop
+                # inherit the original name's dtype unless one is given for the new name
+                if (
+                    isinstance(self.dtype, dict)
+                    and self.dtype.get(old_col) is not None
+                    and self.dtype.get(col) is None
+                ):
+                    self.dtype[col] = self.dtype[old_col]
+
+            names[i] = col
+            counts[col] = cur_count + 1
+        # final (possibly mangled) names of the columns that had an empty header
+        self.unnamed_cols = {names[i] for i in unnamed_col_indices}
+        return names
+
     def _adjust_column_names(self, table: pa.Table) -> bool:
         num_cols = len(table.columns)
         multi_index_named = True
@@ -196,7 +246,8 @@ def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFram
                     raise ValueError(f"Index {item} invalid")
 
                 # Process dtype for index_col and drop from dtypes
-                if self.dtype is not None:
+                # (non-dict dtype is applied to the whole frame later)
+                if isinstance(self.dtype, dict):
                     key, new_dtype = (
                         (item, self.dtype.get(item))
                         if self.dtype.get(item) is not None
@@ -210,6 +261,13 @@ def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFram
             # Clear names if headerless and no name given
             if self.header is None and not multi_index_named:
                 frame.index.names = [None] * len(frame.index.names)
+            elif self.unnamed_cols:
+                # match other engines: empty header fields used as index
+                # produce an unnamed index level
+                frame.index.names = [
+                    None if name in self.unnamed_cols else name
+                    for name in frame.index.names
+                ]
 
         return frame
 
@@ -306,8 +364,22 @@ def read(self) -> DataFrame:
 
             table = table.cast(new_schema)
 
+        if self.header is not None and self.names is None:
+            new_names = self._dedup_column_names(table.column_names)
+            if new_names != table.column_names:
+                table = table.rename_columns(new_names)
+
         multi_index_named = self._adjust_column_names(table)
 
+        if isinstance(self.dtype, defaultdict):
+            if self.header is None:
+                # set by _adjust_column_names above
+                assert self.names is not None
+                columns = list(self.names)
+            else:
+                columns = table.column_names
+            self.dtype = {col: self.dtype[col] for col in columns}
+
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore",

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -126,7 +126,6 @@ def test_1000_sep_not_stripped_after_whitespace(all_parsers, value):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_unnamed_columns(all_parsers):
     data = """A,B,C,,
 1,2,3,4,5

diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -69,7 +69,6 @@ def test_local_file(all_parsers, csv_dir_path):
         pytest.skip("Failing on: " + " ".join(platform.uname()))
 
 
-@xfail_pyarrow  # AssertionError: DataFrame.index are different
 def test_path_path_lib(all_parsers, temp_file):
     parser = all_parsers
     df = DataFrame(

diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
@@ -169,7 +169,6 @@ def test_multi_index_blank_df(all_parsers, data, columns, header, round_trip):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # AssertionError: DataFrame.columns are different
 def test_no_unnamed_index(all_parsers):
     parser = all_parsers
     data = """ id c0 c1 c2

diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py
@@ -14,10 +14,7 @@
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
 )
 
-xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 
-
-@xfail_pyarrow  # AssertionError: DataFrame.index are different
 @pytest.mark.parametrize("na_filter", [True, False])
 def test_inf_parsing(all_parsers, na_filter):
     parser = all_parsers
@@ -41,7 +38,6 @@ def test_inf_parsing(all_parsers, na_filter):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # AssertionError: DataFrame.index are different
 @pytest.mark.parametrize("na_filter", [True, False])
 def test_infinity_parsing(all_parsers, na_filter):
     parser = all_parsers

diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -31,12 +31,20 @@
 
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
-@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_all_columns(
-    all_parsers, dtype, check_orig, using_infer_string, temp_file
+    all_parsers, dtype, check_orig, using_infer_string, temp_file, request
 ):
     # see gh-3795, gh-6607
     parser = all_parsers
+    if parser.engine == "pyarrow" and dtype is object:
+        if not check_orig:
+            # parsed values are not converted to strings
+            mark = pytest.mark.xfail(reason="float values instead of str")
+            request.applymarker(mark)
+        elif using_infer_string:
+            # dtype=object is also applied to the index column
+            mark = pytest.mark.xfail(reason="object index instead of str")
+            request.applymarker(mark)
 
     df = DataFrame(
         np.random.default_rng(2).random((5, 2)).round(4),
@@ -350,7 +358,6 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     # GH#35211
@@ -365,7 +372,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
     parser = all_parsers
@@ -422,7 +428,6 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
     tm.assert_frame_equal(actual, expected)
 
 
-@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("default", ["float", "float64"])
 def test_dtypes_defaultdict(all_parsers, default):
     # GH#41574
@@ -436,7 +441,6 @@ def test_dtypes_defaultdict(all_parsers, default):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     # GH#41574
     data = """a,b,a,b,b.1
@@ -450,7 +454,6 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_invalid(all_parsers):
     # GH#41574
     data = """a,b

diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -166,13 +166,9 @@ def test_empty_with_index_col_false(all_parsers):
         ["NotReallyUnnamed", "Unnamed: 0"],
     ],
 )
-def test_multi_index_naming(all_parsers, index_names, request):
+def test_multi_index_naming(all_parsers, index_names):
     parser = all_parsers
 
-    if parser.engine == "pyarrow" and "" in index_names:
-        mark = pytest.mark.xfail(reason="One case raises, others are wrong")
-        request.applymarker(mark)
-
     # We don't want empty index names being replaced with "Unnamed: 0"
     data = ",".join([*index_names, "col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
     result = parser.read_csv(StringIO(data), index_col=[0, 1])
@@ -184,7 +180,6 @@ def test_multi_index_naming(all_parsers, index_names, request):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_multi_index_naming_not_all_at_beginning(all_parsers):
     parser = all_parsers
     data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
@@ -199,7 +194,6 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_no_multi_index_level_names_empty(temp_file, all_parsers):
     # GH 10984
     parser = all_parsers

diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -14,15 +14,11 @@
 )
 import pandas._testing as tm
 
-xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
-
-
 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
 )
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_basic(all_parsers):
     parser = all_parsers
 
@@ -33,7 +29,6 @@ def test_basic(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_basic_names(all_parsers):
     # See gh-7160
     parser = all_parsers
@@ -54,7 +49,6 @@ def test_basic_names_raise(all_parsers):
         parser.read_csv(StringIO(data), names=["a", "b", "a"])
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 @pytest.mark.parametrize(
     "data,expected",
     [
@@ -122,7 +116,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
         parser.read_csv(StringIO(data), names=names)
 
 
-@xfail_pyarrow  # AssertionError: DataFrame.columns are different
 def test_mangled_unnamed_placeholders(all_parsers):
     # xref gh-13017
     orig_key = "0"
@@ -145,7 +138,6 @@ def test_mangled_unnamed_placeholders(all_parsers):
         tm.assert_frame_equal(df, expected)
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_mangle_dupe_cols_already_exists(all_parsers):
     # GH#14704
     parser = all_parsers
@@ -159,7 +151,6 @@ def test_mangle_dupe_cols_already_exists(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # ValueError: Found non-unique column index
 def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
     # GH#14704
     parser = all_parsers

diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -85,7 +85,6 @@ def test_date_col_as_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_nat_parse(all_parsers, temp_file):
     # see gh-3062
     parser = all_parsers