Skip to content

Commit d5793fd

Browse files
feat(python): large speedup for df.iterrows (~200-400%) (#5979)
1 parent 9c3a659 commit d5793fd

File tree

2 files changed

+71
-20
lines changed

2 files changed

+71
-20
lines changed

py-polars/polars/internals/dataframe/frame.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6447,6 +6447,11 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[Any]:
64476447
Return named tuples instead of regular tuples. This is more expensive than
64486448
returning regular tuples, but allows for accessing values by column name.
64496449
6450+
Warnings
6451+
--------
6452+
Row-iteration is not optimal as the underlying data is stored in columnar form;
6453+
where possible, prefer export via one of the dedicated export/output methods.
6454+
64506455
Examples
64516456
--------
64526457
>>> df = pl.DataFrame(
@@ -6460,6 +6465,10 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[Any]:
64606465
>>> df.rows(named=True)
64616466
[Row(a=1, b=2), Row(a=3, b=4), Row(a=5, b=6)]
64626467
6468+
See Also
6469+
--------
6470+
iterrows : row iterator over frame data (does not materialise all rows).
6471+
64636472
"""
64646473
if named:
64656474
Row = namedtuple("Row", self.columns) # type: ignore[misc]
@@ -6468,15 +6477,19 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[Any]:
64686477
return self._df.row_tuples()
64696478

64706479
@overload
6471-
def iterrows(self, named: Literal[False] = ...) -> Iterator[tuple[Any, ...]]:
6480+
def iterrows(
6481+
self, named: Literal[False] = ..., buffer_size: int = ...
6482+
) -> Iterator[tuple[Any, ...]]:
64726483
...
64736484

64746485
@overload
6475-
def iterrows(self, named: Literal[True] = ...) -> Iterator[Any]:
6486+
def iterrows(
6487+
self, named: Literal[True] = ..., buffer_size: int = ...
6488+
) -> Iterator[Any]:
64766489
...
64776490

64786491
def iterrows(
6479-
self, named: bool = False
6492+
self, named: bool = False, buffer_size: int = 500
64806493
) -> Iterator[tuple[Any, ...]] | Iterator[Any]:
64816494
"""
64826495
Returns an iterator over the rows in the DataFrame.
@@ -6487,9 +6500,22 @@ def iterrows(
64876500
Return named tuples instead of regular tuples. This is more expensive than
64886501
returning regular tuples, but allows for accessing values by column name.
64896502
6503+
buffer_size
6504+
Determines the number of rows that are buffered internally while iterating
6505+
over the data; you should only modify this in very specific cases where the
6506+
default value is determined not to be a good fit to your access pattern, as
6507+
the speedup from using the buffer is significant (~2-4x). Setting this
6508+
value to zero disables row buffering.
6509+
64906510
Warnings
64916511
--------
6492-
This is very expensive and should not be used in any performance critical code!
6512+
Row-iteration is not optimal as the underlying data is stored in columnar form;
6513+
where possible, prefer export via one of the dedicated export/output methods.
6514+
6515+
Notes
6516+
-----
6517+
If you are planning to materialise all frame data at once you should prefer
6518+
calling ``rows()``, which will be faster.
64936519
64946520
Examples
64956521
--------
@@ -6504,9 +6530,25 @@ def iterrows(
65046530
>>> [row.b for row in df.iterrows(named=True)]
65056531
[2, 4, 6]
65066532
6533+
See Also
6534+
--------
6535+
rows : materialises all frame data as a list of rows.
6536+
65076537
"""
6538+
# note: buffering rows results in a 2-4x speedup over individual calls
6539+
# to ".row(i)", so it should only be disabled in extremely specific cases.
65086540
if named:
65096541
Row = namedtuple("Row", self.columns) # type: ignore[misc]
6542+
if buffer_size:
6543+
for offset in range(0, self.height, buffer_size):
6544+
rows_chunk = self.slice(offset, buffer_size).rows(named=False)
6545+
if named:
6546+
for row in rows_chunk:
6547+
yield Row(*row)
6548+
else:
6549+
yield from rows_chunk
6550+
6551+
elif named:
65106552
for i in range(self.height):
65116553
yield Row(*self.row(i))
65126554
else:

py-polars/tests/unit/test_rows.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -68,26 +68,35 @@ def test_rows() -> None:
6868
def test_iterrows() -> None:
6969
df = pl.DataFrame({"a": [1, 2, 3], "b": [None, False, None]})
7070

71-
# Regular iterrows
71+
# Default iterrows behaviour
7272
it = df.iterrows()
7373
assert next(it) == (1, None)
7474
assert next(it) == (2, False)
7575
assert next(it) == (3, None)
7676
with pytest.raises(StopIteration):
7777
next(it)
7878

79-
# Named iterrows
80-
it_named = df.iterrows(named=True)
81-
82-
row = next(it_named)
83-
assert row.a == 1
84-
assert row.b is None
85-
row = next(it_named)
86-
assert row.a == 2
87-
assert row.b is False
88-
row = next(it_named)
89-
assert row.a == 3
90-
assert row.b is None
91-
92-
with pytest.raises(StopIteration):
93-
next(it_named)
79+
# Apply explicit row-buffer size
80+
for sz in (0, 1, 2, 3, 4):
81+
it = df.iterrows(buffer_size=sz)
82+
assert next(it) == (1, None)
83+
assert next(it) == (2, False)
84+
assert next(it) == (3, None)
85+
with pytest.raises(StopIteration):
86+
next(it)
87+
88+
# Return rows as namedtuples
89+
it_named = df.iterrows(named=True, buffer_size=sz)
90+
91+
row = next(it_named)
92+
assert row.a == 1
93+
assert row.b is None
94+
row = next(it_named)
95+
assert row.a == 2
96+
assert row.b is False
97+
row = next(it_named)
98+
assert row.a == 3
99+
assert row.b is None
100+
101+
with pytest.raises(StopIteration):
102+
next(it_named)

0 commit comments

Comments
 (0)