|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import pytest |
| 4 | +import pandas as pd |
| 5 | +import pandas._testing as tm |
| 6 | + |
| 7 | +from pandas.io.excel import ExcelWriter |
| 8 | + |
| 9 | + |
class TestAdjacentTables:
    """Tests for reading Excel files with adjacent tables."""

    @pytest.mark.parametrize(
        "engine,read_ext",
        [
            ("openpyxl", ".xlsx"),
            ("xlsxwriter", ".xlsx"),
        ],
    )
    def test_excel_read_adjacent_tables_nrows(self, engine, read_ext, tmp_path):
        """
        Check that ``nrows`` returns only the upper of two stacked tables.

        Two workbooks are written with ``engine``: one with a blank row between
        the tables and one where the second header directly follows the first
        table's last data row. Reading either with ``header=0, nrows=3`` must
        yield exactly the three data rows of the upper table.

        GH-61123
        """
        # xlsxwriter can only write workbooks, so reading always goes through
        # openpyxl; skip (rather than fail) when either dependency is missing.
        reader_engine = "openpyxl"
        pytest.importorskip(reader_engine)
        pytest.importorskip(engine)

        df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

        # The upper table occupies sheet rows 0-3 (header + 3 data rows).
        # startrow=5 leaves row 4 blank; startrow=4 puts the lower header
        # immediately below the upper table's last data row.
        lower_startrows = {"test1": 5, "test2": 4}

        frames = {}
        for stem, startrow in lower_startrows.items():
            path = tmp_path / f"{stem}{read_ext}"
            with ExcelWriter(path, engine=engine) as writer:
                df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
                df_lower.to_excel(
                    writer, sheet_name="Sheet1", startrow=startrow, index=False
                )
            # nrows counts data rows only; the header row is consumed by header=0
            frames[stem] = pd.read_excel(
                path, header=0, nrows=3, engine=reader_engine
            )

        # Expected data - just the upper table
        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        for result in frames.values():
            tm.assert_frame_equal(result, expected)
            assert result.shape == (3, 2), f"Expected (3, 2) but got {result.shape}"

            # If the lower table's header had been read as data, its labels
            # would show up in the last row.
            last_row_values = [str(x) for x in result.iloc[-1].values]
            assert "A" not in last_row_values, (
                "Second table header was incorrectly included"
            )
            assert "B" not in last_row_values, (
                "Second table header was incorrectly included"
            )
0 commit comments