Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -346,9 +346,12 @@ MultiIndex
I/O
^^^
- :func:`read_csv` with ``memory_map=True`` and an in-memory buffer (e.g. ``BytesIO``) now raises a clear ``ValueError`` instead of a cryptic ``UnsupportedOperation: fileno`` (:issue:`45630`)
- Fixed bug in :func:`read_csv` with ``engine="pyarrow"`` raising ``AttributeError`` when passing a non-dict ``dtype`` together with ``index_col`` (:issue:`65859`)
- Fixed bug in :func:`read_csv` with ``engine="pyarrow"`` where a ``defaultdict`` passed as ``dtype`` did not apply its default to columns not explicitly listed (:issue:`65859`)
- Fixed bug in :func:`read_csv` with the ``c`` engine where an embedded ``\r`` followed by a space in an unquoted field could cause an infinite re-parsing loop, producing spurious rows or a buffer overflow (:issue:`51141`)
- Fixed bug in :func:`read_excel` where usage of ``skiprows`` could lead to an infinite loop (:issue:`64027`)
- Fixed bug where :func:`read_html` parsed nested tables incorrectly when using ``html5lib`` or ``bs4`` flavors (:issue:`64524`)
- Fixed bugs in :func:`read_csv` with ``engine="pyarrow"`` where column names were handled inconsistently with the other engines: duplicated names were not de-duplicated to ``"x.1"``-style names, empty header fields did not get ``"Unnamed: {i}"`` placeholder names, and an unnamed ``index_col`` produced an index named ``""`` instead of an unnamed index (:issue:`65859`)
- Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows``
when the requested row count is not a multiple of the chunk size (:issue:`64025`)
- :meth:`HDFStore.put` and :meth:`HDFStore.append` now support storing :class:`Series` and :class:`DataFrame` columns with :class:`PeriodDtype` in both ``"fixed"`` and ``"table"`` formats (:issue:`41978`)
Expand Down
74 changes: 73 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from collections import defaultdict
from typing import TYPE_CHECKING
import warnings

Expand Down Expand Up @@ -169,6 +170,55 @@ def _get_convert_options(self):

return convert_options

def _dedup_column_names(self, raw_names: list[str]) -> list[str]:
    """
    Produce unique, non-empty column labels from a raw header row.

    Blank labels are replaced with ``"Unnamed: {position}"`` and repeated
    labels receive a ``".{n}"`` suffix, following the same scheme as
    pandas._libs.parsers.TextReader so that all engines agree.

    Parameters
    ----------
    raw_names : list of str
        Header labels exactly as read from the file.

    Returns
    -------
    list of str
        The labels after placeholder substitution and de-duplication,
        in the original column order.

    Notes
    -----
    Side effects: ``self.unnamed_cols`` is set to the final labels of the
    columns whose header was blank, and when ``self.dtype`` is a dict, a
    dtype given for a duplicated label is copied to its mangled label
    unless the mangled label already has a non-None entry.
    """
    blank_positions = [pos for pos, label in enumerate(raw_names) if label == ""]
    blank_lookup = set(blank_positions)
    names = [
        f"Unnamed: {pos}" if pos in blank_lookup else label
        for pos, label in enumerate(raw_names)
    ]

    # Visit columns that came with a real header first so they keep their
    # given label; placeholder labels are the ones mangled on a clash.
    named_positions = [pos for pos in range(len(names)) if pos not in blank_lookup]
    seen: dict[str, int] = {}

    for pos in (*named_positions, *blank_positions):
        base = names[pos]
        suffix = seen.get(base, 0)

        if suffix == 0:
            # First occurrence: the label is kept untouched.
            seen[base] = 1
            continue

        candidate = base
        while suffix > 0:
            seen[base] = suffix + 1
            candidate = f"{base}.{suffix}"
            if candidate in names:
                # Label already present in the header; try the next suffix.
                suffix += 1
            else:
                # Free in the header, but may have been handed out earlier.
                suffix = seen.get(candidate, 0)

        dtype = self.dtype
        if (
            isinstance(dtype, dict)
            and dtype.get(base) is not None
            and dtype.get(candidate) is None
        ):
            dtype[candidate] = dtype[base]

        names[pos] = candidate
        # The loop above only exits once the suffix counter is back at 0.
        seen[candidate] = 1

    self.unnamed_cols = {names[pos] for pos in blank_positions}
    return names

def _adjust_column_names(self, table: pa.Table) -> bool:
num_cols = len(table.columns)
multi_index_named = True
Expand Down Expand Up @@ -196,7 +246,8 @@ def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFram
raise ValueError(f"Index {item} invalid")

# Process dtype for index_col and drop from dtypes
if self.dtype is not None:
# (non-dict dtype is applied to the whole frame later)
if isinstance(self.dtype, dict):
key, new_dtype = (
(item, self.dtype.get(item))
if self.dtype.get(item) is not None
Expand All @@ -210,6 +261,13 @@ def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFram
# Clear names if headerless and no name given
if self.header is None and not multi_index_named:
frame.index.names = [None] * len(frame.index.names)
elif self.unnamed_cols:
# match other engines: empty header fields used as index
# produce an unnamed index level
frame.index.names = [
None if name in self.unnamed_cols else name
for name in frame.index.names
]

return frame

Expand Down Expand Up @@ -306,8 +364,22 @@ def read(self) -> DataFrame:

table = table.cast(new_schema)

if self.header is not None and self.names is None:
new_names = self._dedup_column_names(table.column_names)
if new_names != table.column_names:
table = table.rename_columns(new_names)

multi_index_named = self._adjust_column_names(table)

if isinstance(self.dtype, defaultdict):
if self.header is None:
# set by _adjust_column_names above
assert self.names is not None
columns = list(self.names)
else:
columns = table.column_names
self.dtype = {col: self.dtype[col] for col in columns}

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ def test_1000_sep_not_stripped_after_whitespace(all_parsers, value):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_unnamed_columns(all_parsers):
data = """A,B,C,,
1,2,3,4,5
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def test_local_file(all_parsers, csv_dir_path):
pytest.skip("Failing on: " + " ".join(platform.uname()))


@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers, temp_file):
parser = all_parsers
df = DataFrame(
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,6 @@ def test_multi_index_blank_df(all_parsers, data, columns, header, round_trip):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_no_unnamed_index(all_parsers):
parser = all_parsers
data = """ id c0 c1 c2
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/common/test_inf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
parser = all_parsers
Expand All @@ -41,7 +38,6 @@ def test_inf_parsing(all_parsers, na_filter):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # AssertionError: DataFrame.index are different
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
parser = all_parsers
Expand Down
17 changes: 10 additions & 7 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,20 @@

@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_all_columns(
all_parsers, dtype, check_orig, using_infer_string, temp_file
all_parsers, dtype, check_orig, using_infer_string, temp_file, request
):
# see gh-3795, gh-6607
parser = all_parsers
if parser.engine == "pyarrow" and dtype is object:
if not check_orig:
# parsed values are not converted to strings
mark = pytest.mark.xfail(reason="float values instead of str")
request.applymarker(mark)
elif using_infer_string:
# dtype=object is also applied to the index column
mark = pytest.mark.xfail(reason="object index instead of str")
request.applymarker(mark)

df = DataFrame(
np.random.default_rng(2).random((5, 2)).round(4),
Expand Down Expand Up @@ -350,7 +358,6 @@ def test_true_values_cast_to_bool(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
# GH#35211
Expand All @@ -365,7 +372,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
# GH#42022
parser = all_parsers
Expand Down Expand Up @@ -422,7 +428,6 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
tm.assert_frame_equal(actual, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
# GH#41574
Expand All @@ -436,7 +441,6 @@ def test_dtypes_defaultdict(all_parsers, default):
tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
# GH#41574
data = """a,b,a,b,b.1
Expand All @@ -450,7 +454,6 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_invalid(all_parsers):
# GH#41574
data = """a,b
Expand Down
8 changes: 1 addition & 7 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,9 @@ def test_empty_with_index_col_false(all_parsers):
["NotReallyUnnamed", "Unnamed: 0"],
],
)
def test_multi_index_naming(all_parsers, index_names, request):
def test_multi_index_naming(all_parsers, index_names):
parser = all_parsers

if parser.engine == "pyarrow" and "" in index_names:
mark = pytest.mark.xfail(reason="One case raises, others are wrong")
request.applymarker(mark)

# We don't want empty index names being replaced with "Unnamed: 0"
data = ",".join([*index_names, "col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
result = parser.read_csv(StringIO(data), index_col=[0, 1])
Expand All @@ -184,7 +180,6 @@ def test_multi_index_naming(all_parsers, index_names, request):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_multi_index_naming_not_all_at_beginning(all_parsers):
parser = all_parsers
data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
Expand All @@ -199,7 +194,6 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_no_multi_index_level_names_empty(temp_file, all_parsers):
# GH 10984
parser = all_parsers
Expand Down
9 changes: 0 additions & 9 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,11 @@
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_basic(all_parsers):
parser = all_parsers

Expand All @@ -33,7 +29,6 @@ def test_basic(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_basic_names(all_parsers):
# See gh-7160
parser = all_parsers
Expand All @@ -54,7 +49,6 @@ def test_basic_names_raise(all_parsers):
parser.read_csv(StringIO(data), names=["a", "b", "a"])


@xfail_pyarrow # ValueError: Found non-unique column index
@pytest.mark.parametrize(
"data,expected",
[
Expand Down Expand Up @@ -122,7 +116,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
parser.read_csv(StringIO(data), names=names)


@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
orig_key = "0"
Expand All @@ -145,7 +138,6 @@ def test_mangled_unnamed_placeholders(all_parsers):
tm.assert_frame_equal(df, expected)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists(all_parsers):
# GH#14704
parser = all_parsers
Expand All @@ -159,7 +151,6 @@ def test_mangle_dupe_cols_already_exists(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: Found non-unique column index
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
# GH#14704
parser = all_parsers
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ def test_date_col_as_index_col(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_nat_parse(all_parsers, temp_file):
# see gh-3062
parser = all_parsers
Expand Down
Loading