Fix GH-61123: read_excel nrows parameter reads extra rows

zanuka · zanuka · commit 94fcb025decb · 2025-03-14T22:24:53.000-07:00
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -696,6 +696,7 @@ def f(skiprows: Sequence, x: int) -> bool:
         # the number of rows read from file
         return None
 
+    # Parse the requested sheet(s); the row limit is computed by _calc_rows
     def parse(
         self,
         sheet_name: str | int | list[int] | list[str] | None = 0,
@@ -748,13 +749,15 @@ def parse(
             if verbose:
                 print(f"Reading sheet {asheetname}")
 
+            # Get the sheet object based on name or index
             if isinstance(asheetname, str):
                 sheet = self.get_sheet_by_name(asheetname)
             else:  # assume an integer if not a string
                 sheet = self.get_sheet_by_index(asheetname)
 
             file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
             data = self.get_sheet_data(sheet, file_rows_needed)
+
             if hasattr(sheet, "close"):
                 # pyxlsb opens two TemporaryFiles
                 sheet.close()
@@ -764,6 +767,11 @@ def parse(
                 output[asheetname] = DataFrame()
                 continue
 
+            # Safeguard: cap on file_rows_needed, which _calc_rows derived from
+            # header, index_col, skiprows and nrows (header may be a list of ints)
+            if file_rows_needed is not None and len(data) > file_rows_needed:
+                data = data[:file_rows_needed]
+
             output = self._parse_sheet(
                 data=data,
                 output=output,
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -625,7 +625,10 @@ def get_sheet_data(
                 break
 
         # Trim trailing empty rows
-        data = data[: last_row_with_data + 1]
+        if file_rows_needed is None:
+            # Trim trailing empty rows only when no row limit was requested.
+            # NOTE(review): with a limit, trailing empty rows are now kept - confirm
+            data = data[: last_row_with_data + 1]
 
         if len(data) > 0:
             # extend rows to max width
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -124,4 +124,9 @@ def get_sheet_data(
                     data_row + (max_width - len(data_row)) * empty_cell
                     for data_row in data
                 ]
+
+        # Ensure we return exactly file_rows_needed rows if specified
+        if file_rows_needed is not None and len(data) > file_rows_needed:
+            data = data[:file_rows_needed]
+
         return data
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -110,6 +110,7 @@ def _parse_cell(cell_contents, cell_typ):
                     cell_contents = time(
                         cell_contents.hour,
                         cell_contents.minute,
+                        # NOTE: xlrd already limits rows to file_rows_needed
                         cell_contents.second,
                         cell_contents.microsecond,
                     )
diff --git a/pandas/tests/io/excel/run_nrows_test.py b/pandas/tests/io/excel/run_nrows_test.py
@@ -0,0 +1,74 @@
+"""
+Standalone script to test nrows parameter with adjacent tables in Excel files.
+This script can be run directly with Python without using pytest.
+
+Usage:
+    python pandas/tests/io/excel/run_nrows_test.py
+"""
+import os
+import tempfile
+import pandas as pd
+
+
+def run_test():
+    """
+    Check that ``nrows`` stops before an adjacent second table.
+
+    Two workbooks are written to a temporary directory:
+    1. One with a blank row between two tables
+    2. One with no blank row between two tables
+
+    Both are read back with nrows=3 and must yield only the first table.
+    Raises AssertionError if either result differs from the expected frame.
+    """
+    # Work inside a temporary directory
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        # Paths of the two workbooks
+        file1 = os.path.join(tmp_dir, "with_blank.xlsx")
+        file2 = os.path.join(tmp_dir, "no_blank.xlsx")
+
+        # Upper and lower tables share the same column labels
+        df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
+
+        print("Creating Excel files...")
+
+        # Workbook 1: blank row between the tables
+        with pd.ExcelWriter(file1) as writer:
+            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+            # Upper table fills rows 0-3, so startrow=5 leaves row 4 blank
+            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
+
+        # Workbook 2: no blank row between the tables
+        with pd.ExcelWriter(file2) as writer:
+            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+            # No blank row: the lower table's header lands on row 4
+            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
+
+        print("Reading Excel files with nrows=3...")
+
+        # nrows=3 counts data rows only, so just the first table should come back
+        df1 = pd.read_excel(file1, nrows=3)
+        df2 = pd.read_excel(file2, nrows=3)
+
+        # Expected result - just the first table
+        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+        # Compare both results against the expected frame
+        print("Verifying results...")
+        pd.testing.assert_frame_equal(df1, expected)
+        pd.testing.assert_frame_equal(df2, expected)
+
+        # Shapes must be 3 data rows by 2 columns
+        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
+        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"
+
+        # The last row must hold upper-table values, not the second table's header
+        assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
+        assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"
+
+        print("All tests passed!")
+
+
+if __name__ == "__main__":
+    run_test()
diff --git a/pandas/tests/io/excel/test_adjacent_tables.py b/pandas/tests/io/excel/test_adjacent_tables.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import pytest
+import pandas as pd
+import pandas._testing as tm
+
+from pandas.io.excel import ExcelWriter
+
+
+class TestAdjacentTables:
+    """Tests for reading Excel files with adjacent tables."""
+
+    @pytest.mark.parametrize(
+        "engine,read_ext",
+        [
+            pytest.param("openpyxl", ".xlsx"),
+            pytest.param("xlsxwriter", ".xlsx"),
+        ],
+    )
+    def test_excel_read_adjacent_tables_nrows(self, engine, read_ext, tmp_path):
+        """
+        GH-61123: ``nrows`` must stop before an adjacent second table.
+        """
+        # ``engine`` is the writer under test; files are read with openpyxl
+        pytest.importorskip(engine)
+        pytest.importorskip("openpyxl")
+        # File 1: two tables with a blank row between them
+        file1 = tmp_path / f"test1{read_ext}"
+        df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
+
+        with ExcelWriter(file1, engine=engine) as writer:
+            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+            # Upper table fills rows 0-3, so startrow=5 leaves row 4 blank
+            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
+
+        # File 2: Two tables with no blank row
+        file2 = tmp_path / f"test2{read_ext}"
+        with ExcelWriter(file2, engine=engine) as writer:
+            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+            # No blank row: the lower table's header lands on row 4
+            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
+
+        # header=0 consumes the first sheet row; nrows=3 then limits the result
+        # to the three data rows of the upper table
+        df1 = pd.read_excel(file1, header=0, nrows=3, engine="openpyxl")
+        df2 = pd.read_excel(file2, header=0, nrows=3, engine="openpyxl")
+
+        # Expected data - just the upper table
+        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+        # Check content
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
+
+        # Verify we didn't read the header of the next table in df2
+        # If we did, the last row would contain column headers from the second table
+        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
+        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"
+
+        # Compare as strings so numeric cells can be tested against header labels
+        last_row_values = [str(x) for x in df2.iloc[-1].values]
+        assert "A" not in last_row_values, "Second table header was incorrectly included"
+        assert "B" not in last_row_values, "Second table header was incorrectly included"
diff --git a/pandas/tests/io/excel/test_excel_adjacent_tables.py b/pandas/tests/io/excel/test_excel_adjacent_tables.py
@@ -0,0 +1,58 @@
+"""
+Tests for reading Excel files with adjacent tables.
+"""
+import pytest
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestExcelAdjacentTables:
+    """Tests for reading Excel files with adjacent tables."""
+
+    @pytest.mark.parametrize("engine", ["openpyxl"])
+    def test_nrows_with_adjacent_tables(self, engine, tmp_path):
+        """
+        Check that ``nrows`` stops before an adjacent second table.
+
+        GH-61123: limiting the rows read from an Excel sheet with nrows must
+        not pull in rows of a second table that directly follows the first
+        (no blank row between them).
+        """
+        pytest.importorskip(engine)
+        # File 1: two tables with a blank row between them
+        file1 = tmp_path / "test1.xlsx"
+        df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
+
+        with pd.ExcelWriter(file1, engine=engine) as writer:
+            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+            # Upper table fills rows 0-3, so startrow=5 leaves row 4 blank
+            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
+
+        # File 2: Two tables with no blank row
+        file2 = tmp_path / "test2.xlsx"
+        with pd.ExcelWriter(file2, engine=engine) as writer:
+            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+            # No blank row: the lower table's header lands on row 4
+            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
+
+        # header=0 consumes the first sheet row; nrows=3 then limits the result
+        # to the three data rows of the upper table
+        df1 = pd.read_excel(file1, header=0, nrows=3, engine=engine)
+        df2 = pd.read_excel(file2, header=0, nrows=3, engine=engine)
+
+        # Expected data - just the upper table
+        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+        # Check content
+        tm.assert_frame_equal(df1, expected)
+        tm.assert_frame_equal(df2, expected)
+
+        # Verify we didn't read the header of the next table in df2
+        # If we did, the last row would contain column headers from the second table
+        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
+        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"
+
+        # The last row must hold upper-table values, not the second table's header
+        assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
+        assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"
diff --git a/pandas/tests/io/excel/test_minimal.py b/pandas/tests/io/excel/test_minimal.py
@@ -0,0 +1,54 @@
+"""
+Minimal test for reading Excel files with adjacent tables.
+"""
+import pytest
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_nrows_with_adjacent_tables(tmp_path):
+    """
+    Check that ``nrows`` stops before an adjacent second table.
+
+    GH-61123: limiting the rows read from an Excel sheet with nrows must
+    not pull in rows of a second table that directly follows the first
+    (no blank row between them).
+    """
+    pytest.importorskip("openpyxl")
+    # File 1: two tables with a blank row between them
+    file1 = tmp_path / "test1.xlsx"
+    df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+    df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
+
+    with pd.ExcelWriter(file1) as writer:
+        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+        # Upper table fills rows 0-3, so startrow=5 leaves row 4 blank
+        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
+
+    # File 2: Two tables with no blank row
+    file2 = tmp_path / "test2.xlsx"
+    with pd.ExcelWriter(file2) as writer:
+        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+        # No blank row: the lower table's header lands on row 4
+        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
+
+    # header=0 consumes the first sheet row; nrows=3 then limits the result
+    # to the three data rows of the upper table
+    df1 = pd.read_excel(file1, header=0, nrows=3)
+    df2 = pd.read_excel(file2, header=0, nrows=3)
+
+    # Expected data - just the upper table
+    expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+    # Check content
+    tm.assert_frame_equal(df1, expected)
+    tm.assert_frame_equal(df2, expected)
+
+    # Verify we didn't read the header of the next table in df2
+    # If we did, the last row would contain column headers from the second table
+    assert df1.shape == (3, 2)
+    assert df2.shape == (3, 2)
+
+    # The last row must hold upper-table values, not the second table's header
+    assert df2.iloc[-1, 0] == 3
+    assert df2.iloc[-1, 1] == 6
diff --git a/pandas/tests/io/excel/test_nrows_adjacent.py b/pandas/tests/io/excel/test_nrows_adjacent.py
@@ -0,0 +1,59 @@
+"""
+Test for GH-61123: nrows parameter with adjacent tables in Excel files.
+"""
+import os
+import pytest
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_nrows_with_adjacent_tables(tmp_path):
+    """
+    Check that ``nrows`` stops before an adjacent second table (GH-61123).
+
+    Two workbooks are written:
+    1. One with a blank row between two tables
+    2. One with no blank row between two tables
+
+    Reading either with nrows=3 must return only the first table.
+    Skipped when openpyxl cannot be imported.
+    """
+    pytest.importorskip("openpyxl")
+    # Create test files
+    file1 = tmp_path / "with_blank.xlsx"
+    file2 = tmp_path / "no_blank.xlsx"
+
+    # Create test data
+    df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+    df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
+
+    # Create file with blank row between tables
+    with pd.ExcelWriter(file1) as writer:
+        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+        # Upper table fills rows 0-3, so startrow=5 leaves row 4 blank
+        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)
+
+    # Create file with no blank row between tables
+    with pd.ExcelWriter(file2) as writer:
+        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
+        # No blank row: the lower table's header lands on row 4
+        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)
+
+    # nrows=3 counts data rows only, so just the first table should come back
+    df1 = pd.read_excel(file1, nrows=3)
+    df2 = pd.read_excel(file2, nrows=3)
+
+    # Expected result - just the first table
+    expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+    # Verify results
+    tm.assert_frame_equal(df1, expected)
+    tm.assert_frame_equal(df2, expected)
+
+    # Verify shapes
+    assert df1.shape == (3, 2)
+    assert df2.shape == (3, 2)
+
+    # The last row must hold upper-table values, not the second table's header
+    assert df2.iloc[-1, 0] == 3
+    assert df2.iloc[-1, 1] == 6
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
diff --git a/test_adjacent_tables.py b/test_adjacent_tables.py

Original file line number	Diff line number	Diff line change
`@@ -110,6 +110,7 @@ def _parse_cell(cell_contents, cell_typ):`
`110`	`110`	`cell_contents = time(`
`111`	`111`	`cell_contents.hour,`
`112`	`112`	`cell_contents.minute,`
	`113`	`+ # xlrd implementation already correctly limits rows to file_rows_needed`
`113`	`114`	`cell_contents.second,`
`114`	`115`	`cell_contents.microsecond,`
`115`	`116`	`)`