Skip to content

Commit 94fcb02

Browse files
committed
fix for 61123 read_excel nrows param reads extra rows
1 parent 8943c97 commit 94fcb02

11 files changed

+492
-1
lines changed

pandas/io/excel/_base.py

+8
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ def f(skiprows: Sequence, x: int) -> bool:
696696
# the number of rows read from file
697697
return None
698698

699+
# Parse the requested sheet(s); the number of file rows to read is calculated by _calc_rows
699700
def parse(
700701
self,
701702
sheet_name: str | int | list[int] | list[str] | None = 0,
@@ -748,13 +749,15 @@ def parse(
748749
if verbose:
749750
print(f"Reading sheet {asheetname}")
750751

752+
# Get the sheet object based on name or index
751753
if isinstance(asheetname, str):
752754
sheet = self.get_sheet_by_name(asheetname)
753755
else: # assume an integer if not a string
754756
sheet = self.get_sheet_by_index(asheetname)
755757

756758
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
757759
data = self.get_sheet_data(sheet, file_rows_needed)
760+
758761
if hasattr(sheet, "close"):
759762
# pyxlsb opens two TemporaryFiles
760763
sheet.close()
@@ -764,6 +767,11 @@ def parse(
764767
output[asheetname] = DataFrame()
765768
continue
766769

770+
# Ensure we don't process more rows than requested with nrows
771+
# This is a safeguard in case get_sheet_data returns more rows than requested
772+
if nrows is not None and len(data) > nrows:
773+
data = data[:nrows + (0 if header is None else header + 1)]
774+
767775
output = self._parse_sheet(
768776
data=data,
769777
output=output,

pandas/io/excel/_openpyxl.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,10 @@ def get_sheet_data(
625625
break
626626

627627
# Trim trailing empty rows
628-
data = data[: last_row_with_data + 1]
628+
if file_rows_needed is None:
629+
# Only trim trailing empty rows when file_rows_needed is None
630+
# to ensure we return exactly file_rows_needed rows when specified
631+
data = data[: last_row_with_data + 1]
629632

630633
if len(data) > 0:
631634
# extend rows to max width

pandas/io/excel/_pyxlsb.py

+5
Original file line numberDiff line numberDiff line change
@@ -124,4 +124,9 @@ def get_sheet_data(
124124
data_row + (max_width - len(data_row)) * empty_cell
125125
for data_row in data
126126
]
127+
128+
# Ensure we return exactly file_rows_needed rows if specified
129+
if file_rows_needed is not None and len(data) > file_rows_needed:
130+
data = data[:file_rows_needed]
131+
127132
return data

pandas/io/excel/_xlrd.py

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def _parse_cell(cell_contents, cell_typ):
110110
cell_contents = time(
111111
cell_contents.hour,
112112
cell_contents.minute,
113+
# NOTE: unrelated to this time() call - the xlrd implementation already limits rows to file_rows_needed
113114
cell_contents.second,
114115
cell_contents.microsecond,
115116
)
+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""
2+
Standalone script to test nrows parameter with adjacent tables in Excel files.
3+
This script can be run directly with Python without using pytest.
4+
5+
Usage:
6+
python pandas/tests/io/excel/run_nrows_test.py
7+
"""
8+
import os
9+
import tempfile
10+
import pandas as pd
11+
12+
13+
def run_test():
14+
"""
15+
Test that nrows parameter correctly handles adjacent tables.
16+
17+
This test creates two Excel files:
18+
1. One with a blank row between two tables
19+
2. One with no blank row between two tables
20+
21+
Then it verifies that reading with nrows=3 returns only the first table
22+
in both cases.
23+
"""
24+
# Create temporary directory
25+
with tempfile.TemporaryDirectory() as tmp_dir:
26+
# Create test files
27+
file1 = os.path.join(tmp_dir, "with_blank.xlsx")
28+
file2 = os.path.join(tmp_dir, "no_blank.xlsx")
29+
30+
# Create test data
31+
df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
32+
df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
33+
34+
print("Creating Excel files...")
35+
36+
# Create file with blank row between tables
37+
with pd.ExcelWriter(file1) as writer:
38+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
39+
# Upper table fills rows 0-3 (header + 3 data rows); start the lower table at row 5 so row 4 stays blank
40+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
41+
42+
# Create file with no blank row between tables
43+
with pd.ExcelWriter(file2) as writer:
44+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
45+
# No blank row, lower table starts right after (row 4 = header of second table)
46+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
47+
48+
print("Reading Excel files with nrows=3...")
49+
50+
# Read with nrows=3 (should only get the first table)
51+
df1 = pd.read_excel(file1, nrows=3)
52+
df2 = pd.read_excel(file2, nrows=3)
53+
54+
# Expected result - just the first table
55+
expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
56+
57+
# Verify results
58+
print("Verifying results...")
59+
pd.testing.assert_frame_equal(df1, expected)
60+
pd.testing.assert_frame_equal(df2, expected)
61+
62+
# Verify shapes
63+
assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
64+
assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"
65+
66+
# Verify last row doesn't contain headers from second table
67+
assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
68+
assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"
69+
70+
print("All tests passed!")
71+
72+
73+
if __name__ == "__main__":
74+
run_test()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from __future__ import annotations
2+
3+
import pytest
4+
import pandas as pd
5+
import pandas._testing as tm
6+
7+
from pandas.io.excel import ExcelWriter
8+
9+
10+
class TestAdjacentTables:
11+
"""Tests for reading Excel files with adjacent tables."""
12+
13+
@pytest.mark.parametrize(
14+
"engine,read_ext",
15+
[
16+
pytest.param("openpyxl", ".xlsx", marks=[pytest.mark.skip_if_no("openpyxl")]),
17+
pytest.param("xlsxwriter", ".xlsx", marks=[pytest.mark.skip_if_no("xlsxwriter")]),
18+
],
19+
)
20+
def test_excel_read_adjacent_tables_nrows(self, engine, read_ext, tmp_path):
21+
"""
22+
Test that nrows parameter correctly handles adjacent tables with and without blank rows.
23+
24+
GH-61123
25+
"""
26+
# Create test files with tables with and without blank rows between them
27+
# File 1: Two tables with a blank row between
28+
file1 = tmp_path / f"test1{read_ext}"
29+
df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
30+
df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
31+
32+
with ExcelWriter(file1, engine=engine) as writer:
33+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
34+
# Upper table fills rows 0-3 (header + 3 data rows); start the lower table at row 5 so row 4 stays blank
35+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
36+
37+
# File 2: Two tables with no blank row
38+
file2 = tmp_path / f"test2{read_ext}"
39+
with ExcelWriter(file2, engine=engine) as writer:
40+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
41+
# No blank row, lower table starts right after (row 4 = header of second table)
42+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
43+
44+
# Read the header row plus the first 3 data rows (4 sheet rows in total)
45+
# Using nrows=3 to get exactly the upper table without blank rows
46+
df1 = pd.read_excel(file1, header=0, nrows=3, engine=engine)
47+
df2 = pd.read_excel(file2, header=0, nrows=3, engine=engine)
48+
49+
# Expected data - just the upper table
50+
expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
51+
52+
# Check content
53+
tm.assert_frame_equal(df1, expected)
54+
tm.assert_frame_equal(df2, expected)
55+
56+
# Verify we didn't read the header of the next table in df2
57+
# If we did, the last row would contain column headers from the second table
58+
assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
59+
assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"
60+
61+
# Convert the last row's values to strings so they can be compared against the header labels
62+
last_row_values = [str(x) for x in df2.iloc[-1].values]
63+
assert "A" not in last_row_values, "Second table header was incorrectly included"
64+
assert "B" not in last_row_values, "Second table header was incorrectly included"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"""
2+
Tests for reading Excel files with adjacent tables.
3+
"""
4+
import pytest
5+
import pandas as pd
6+
import pandas._testing as tm
7+
8+
9+
class TestExcelAdjacentTables:
10+
"""Tests for reading Excel files with adjacent tables."""
11+
12+
@pytest.mark.parametrize("engine", ["openpyxl"])
13+
def test_nrows_with_adjacent_tables(self, engine, tmp_path):
14+
"""
15+
Test that nrows parameter correctly handles adjacent tables.
16+
17+
GH-61123: When using nrows to limit the number of rows read from an Excel file,
18+
the function should correctly handle cases where tables are adjacent (no blank
19+
row between them).
20+
"""
21+
# Create test files with tables with and without blank rows between them
22+
# File 1: Two tables with a blank row between
23+
file1 = tmp_path / "test1.xlsx"
24+
df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
25+
df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
26+
27+
with pd.ExcelWriter(file1, engine=engine) as writer:
28+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
29+
# Upper table fills rows 0-3 (header + 3 data rows); start the lower table at row 5 so row 4 stays blank
30+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
31+
32+
# File 2: Two tables with no blank row
33+
file2 = tmp_path / "test2.xlsx"
34+
with pd.ExcelWriter(file2, engine=engine) as writer:
35+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
36+
# No blank row, lower table starts right after (row 4 = header of second table)
37+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
38+
39+
# Read the header row plus the first 3 data rows (4 sheet rows in total)
40+
# Using nrows=3 to get exactly the upper table without blank rows
41+
df1 = pd.read_excel(file1, header=0, nrows=3, engine=engine)
42+
df2 = pd.read_excel(file2, header=0, nrows=3, engine=engine)
43+
44+
# Expected data - just the upper table
45+
expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
46+
47+
# Check content
48+
tm.assert_frame_equal(df1, expected)
49+
tm.assert_frame_equal(df2, expected)
50+
51+
# Verify we didn't read the header of the next table in df2
52+
# If we did, the last row would contain column headers from the second table
53+
assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
54+
assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"
55+
56+
# Check specific values in the last row to ensure we didn't read the header
57+
assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
58+
assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"

pandas/tests/io/excel/test_minimal.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Minimal test for reading Excel files with adjacent tables.
3+
"""
4+
import pytest
5+
import pandas as pd
6+
import pandas._testing as tm
7+
8+
9+
def test_nrows_with_adjacent_tables(tmp_path):
10+
"""
11+
Test that nrows parameter correctly handles adjacent tables.
12+
13+
GH-61123: When using nrows to limit the number of rows read from an Excel file,
14+
the function should correctly handle cases where tables are adjacent (no blank
15+
row between them).
16+
"""
17+
# Create test files with tables with and without blank rows between them
18+
# File 1: Two tables with a blank row between
19+
file1 = tmp_path / "test1.xlsx"
20+
df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
21+
df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
22+
23+
with pd.ExcelWriter(file1) as writer:
24+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
25+
# Upper table fills rows 0-3 (header + 3 data rows); start the lower table at row 5 so row 4 stays blank
26+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
27+
28+
# File 2: Two tables with no blank row
29+
file2 = tmp_path / "test2.xlsx"
30+
with pd.ExcelWriter(file2) as writer:
31+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
32+
# No blank row, lower table starts right after (row 4 = header of second table)
33+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
34+
35+
# Read the header row plus the first 3 data rows (4 sheet rows in total)
36+
# Using nrows=3 to get exactly the upper table without blank rows
37+
df1 = pd.read_excel(file1, header=0, nrows=3)
38+
df2 = pd.read_excel(file2, header=0, nrows=3)
39+
40+
# Expected data - just the upper table
41+
expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
42+
43+
# Check content
44+
tm.assert_frame_equal(df1, expected)
45+
tm.assert_frame_equal(df2, expected)
46+
47+
# Verify we didn't read the header of the next table in df2
48+
# If we did, the last row would contain column headers from the second table
49+
assert df1.shape == (3, 2)
50+
assert df2.shape == (3, 2)
51+
52+
# Check specific values in the last row to ensure we didn't read the header
53+
assert df2.iloc[-1, 0] == 3
54+
assert df2.iloc[-1, 1] == 6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""
2+
Test for GH-61123: nrows parameter with adjacent tables in Excel files.
3+
"""
4+
import os
5+
import pytest
6+
import pandas as pd
7+
import pandas._testing as tm
8+
9+
10+
@pytest.mark.skipif(not os.path.exists("pandas/io/excel/_openpyxl.py"), reason="openpyxl not installed")
11+
def test_nrows_with_adjacent_tables(tmp_path):
12+
"""
13+
Test that nrows parameter correctly handles adjacent tables.
14+
15+
This test creates two Excel files:
16+
1. One with a blank row between two tables
17+
2. One with no blank row between two tables
18+
19+
Then it verifies that reading with nrows=3 returns only the first table
20+
in both cases.
21+
"""
22+
# Create test files
23+
file1 = tmp_path / "with_blank.xlsx"
24+
file2 = tmp_path / "no_blank.xlsx"
25+
26+
# Create test data
27+
df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
28+
df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
29+
30+
# Create file with blank row between tables
31+
with pd.ExcelWriter(file1) as writer:
32+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
33+
# Upper table fills rows 0-3 (header + 3 data rows); start the lower table at row 5 so row 4 stays blank
34+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
35+
36+
# Create file with no blank row between tables
37+
with pd.ExcelWriter(file2) as writer:
38+
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
39+
# No blank row, lower table starts right after (row 4 = header of second table)
40+
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
41+
42+
# Read with nrows=3 (should only get the first table)
43+
df1 = pd.read_excel(file1, nrows=3)
44+
df2 = pd.read_excel(file2, nrows=3)
45+
46+
# Expected result - just the first table
47+
expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
48+
49+
# Verify results
50+
tm.assert_frame_equal(df1, expected)
51+
tm.assert_frame_equal(df2, expected)
52+
53+
# Verify shapes
54+
assert df1.shape == (3, 2)
55+
assert df2.shape == (3, 2)
56+
57+
# Verify last row doesn't contain headers from second table
58+
assert df2.iloc[-1, 0] == 3
59+
assert df2.iloc[-1, 1] == 6

0 commit comments

Comments
 (0)