Bug: Save original index and remap after function completes #61116

Open · wants to merge 6 commits into base: main
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -71,6 +71,7 @@ Other enhancements
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
+- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
 - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
@@ -151,6 +152,8 @@ These improvements also fixed certain bugs in groupby:
 - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`)
 - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
 
+- :meth:`Series.nlargest`
+
 .. _whatsnew_300.notable_bug_fixes.notable_bug_fix2:
 
 notable_bug_fix2
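The enhancement note added above concerns :meth:`Series.nlargest` / :meth:`Series.nsmallest` on a Series whose index contains duplicate labels. A minimal illustration of that case, with made-up data, is:

```python
import pandas as pd

# A Series whose index labels repeat; before this change, duplicate labels
# made the nlargest/nsmallest path noticeably slower (see GH 55767).
s = pd.Series([3, 1, 4, 1, 5, 9, 2, 6], index=[0, 0, 1, 1, 2, 2, 3, 3])

# The three largest values are returned with their original labels intact.
print(s.nlargest(3))
```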
36 changes: 28 additions & 8 deletions pandas/core/methods/selectn.py
@@ -11,6 +11,7 @@
 from typing import (
     TYPE_CHECKING,
     Generic,
+    Literal,
     cast,
     final,
 )
@@ -54,7 +55,9 @@
 
 
 class SelectN(Generic[NDFrameT]):
-    def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
+    def __init__(
+        self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
+    ) -> None:
         self.obj = obj
         self.n = n
         self.keep = keep
@@ -111,13 +114,22 @@ def compute(self, method: str) -> Series:
         if n <= 0:
             return self.obj[[]]
 
-        dropped = self.obj.dropna()
-        nan_index = self.obj.drop(dropped.index)
+        # Save index and reset to default index to avoid performance impact
+        # from when index contains duplicates
+        original_index: Index = self.obj.index
+        cur_series = self.obj.reset_index(drop=True)
 
         # slow method
-        if n >= len(self.obj):
+        if n >= len(cur_series):
             ascending = method == "nsmallest"
-            return self.obj.sort_values(ascending=ascending).head(n)
+            final_series = cur_series.sort_values(
+                ascending=ascending, kind="stable"
+            ).head(n)
+            final_series.index = original_index.take(final_series.index)
+            return final_series
+
+        dropped = cur_series.dropna()
+        nan_index = cur_series.drop(dropped.index)
 
         # fast method
         new_dtype = dropped.dtype
@@ -173,7 +185,9 @@ def compute(self, method: str) -> Series:
             # reverse indices
             inds = narr - 1 - inds
 
-        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+        final_series = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+        final_series.index = original_index.take(final_series.index)
+        return final_series
 
 
 class SelectNFrame(SelectN[DataFrame]):
@@ -192,7 +206,13 @@ class SelectNFrame(SelectN[DataFrame]):
     nordered : DataFrame
     """
 
-    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
+    def __init__(
+        self,
+        obj: DataFrame,
+        n: int,
+        keep: Literal["first", "last", "all"],
+        columns: IndexLabel,
+    ) -> None:
         super().__init__(obj, n, keep)
         if not is_list_like(columns) or isinstance(columns, tuple):
             columns = [columns]
@@ -277,4 +297,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
 
         ascending = method == "nsmallest"
 
-        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
+        return frame.sort_values(columns, ascending=ascending, kind="stable")
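The core of the change above is to remember the caller's index, run the selection against a throwaway ``RangeIndex``, and then map the selected positions back onto the saved index with ``Index.take``. A simplified, standalone sketch of that pattern (the helper name and variable names here are illustrative, not part of pandas):

```python
import pandas as pd


def nlargest_with_remap(obj: pd.Series, n: int) -> pd.Series:
    # Remember the original (possibly duplicated) labels.
    original_index = obj.index
    # Work against a clean RangeIndex so duplicate labels cannot slow down
    # the positional operations that follow.
    cur = obj.reset_index(drop=True)
    # A stable sort makes the ordering of tied values deterministic.
    result = cur.sort_values(ascending=False, kind="stable").head(n)
    # The positions in result.index point straight into the saved labels.
    result.index = original_index.take(result.index)
    return result


s = pd.Series([3, 1, 4, 1, 5], index=[0, 0, 1, 1, 1])
print(nlargest_with_remap(s, 2))  # values 5 and 4, both carrying label 1
```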
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_nlargest.py
@@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
             index=[0, 0, 1, 1, 1],
         )
         result = df.nsmallest(n, order)
-        expected = df.sort_values(order).head(n)
+        expected = df.sort_values(order, kind="stable").head(n)
         tm.assert_frame_equal(result, expected)
 
         result = df.nlargest(n, order)
-        expected = df.sort_values(order, ascending=False).head(n)
+        expected = df.sort_values(order, ascending=False, kind="stable").head(n)
         if Version(np.__version__) >= Version("1.25") and (
             (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
         ):
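The test change above pins ``kind="stable"`` when building the expected frames so that rows with equal sort keys keep a deterministic relative order; with the duplicated index used in this test, an unstable sort could order tied rows differently from the ``nsmallest``/``nlargest`` result. A small example of the comparison (data invented here, not taken from the test):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 1, 1]}, index=[0, 0, 1])

# Both rows with a == 1 are kept; a stable sort guarantees they appear in
# their original order, so the expected frame matches nsmallest exactly.
result = df.nsmallest(2, "a")
expected = df.sort_values("a", kind="stable").head(2)
pd.testing.assert_frame_equal(result, expected)
```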