Skip to content

Commit 4ce9b0f

Browse files
authored
fix: pandas was raising when index name and column names overlapped in groupby (#1908)
1 parent 5a8668d commit 4ce9b0f

File tree

6 files changed

+30
-22
lines changed

6 files changed

+30
-22
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
[![PyPI version](https://badge.fury.io/py/narwhals.svg)](https://badge.fury.io/py/narwhals)
1111
[![Downloads](https://static.pepy.tech/badge/narwhals/month)](https://pepy.tech/project/narwhals)
1212
[![Trusted publishing](https://img.shields.io/badge/Trusted_publishing-Provides_attestations-bright_green)](https://peps.python.org/pep-0740/)
13+
[![PYPI - Types](https://img.shields.io/pypi/types/narwhals)](https://pypi.org/project/narwhals)
1314

1415
Extremely lightweight and extensible compatibility layer between dataframe libraries!
1516

docs/pandas_like_concepts/pandas_index.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Narwhals aims to accommodate both!
1616

1717
Let's learn about what Narwhals promises.
1818

19-
## 1. Narwhals will preserve your index for dataframe operations
19+
## 1. Narwhals will preserve your index for common dataframe operations
2020

2121
```python exec="1" source="above" session="ex1"
2222
import narwhals as nw
@@ -39,7 +39,10 @@ print(my_func(df))
3939
```
4040

4141
Note how the result still has the original index - Narwhals did not modify
42-
it.
42+
it. Narwhals will preserve your original index for most common dataframe
43+
operations. However, Narwhals will _not_ preserve the original index for
44+
`DataFrame.group_by`, because there, overlapping index and column names
45+
raise errors.
4346

4447
## 2. Index alignment follows the left-hand-rule
4548

narwhals/_pandas_like/group_by.py

+7-12
Original file line numberDiff line numberDiff line change
@@ -43,32 +43,27 @@ def __init__(
4343
) -> None:
4444
self._df = df
4545
self._keys = keys
46+
# Drop index to avoid potential collisions:
47+
# https://github.com/narwhals-dev/narwhals/issues/1907.
48+
native_frame = df._native_frame.reset_index(drop=True)
4649
if (
4750
self._df._implementation is Implementation.PANDAS
4851
and self._df._backend_version < (1, 1)
4952
): # pragma: no cover
5053
if (
5154
not drop_null_keys
52-
and select_columns_by_name(
53-
self._df._native_frame,
54-
self._keys,
55-
self._df._backend_version,
56-
self._df._implementation,
57-
)
58-
.isna()
59-
.any()
60-
.any()
55+
and self._df.simple_select(*self._keys)._native_frame.isna().any().any()
6156
):
62-
msg = "Grouping by null values is not supported in pandas < 1.0.0"
57+
msg = "Grouping by null values is not supported in pandas < 1.1.0"
6358
raise NotImplementedError(msg)
64-
self._grouped = self._df._native_frame.groupby(
59+
self._grouped = native_frame.groupby(
6560
list(self._keys),
6661
sort=False,
6762
as_index=True,
6863
observed=True,
6964
)
7065
else:
71-
self._grouped = self._df._native_frame.groupby(
66+
self._grouped = native_frame.groupby(
7267
list(self._keys),
7368
sort=False,
7469
as_index=True,

narwhals/typing.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -60,19 +60,14 @@ def simple_select(
6060
) -> CompliantDataFrame: ... # `select` where all args are column names
6161

6262

63-
class CompliantLazyFramme(Protocol):
64-
def __narwhals_lazyframe__(self) -> CompliantDataFrame: ...
63+
class CompliantLazyFrame(Protocol):
64+
def __narwhals_lazyframe__(self) -> CompliantLazyFrame: ...
6565
def __narwhals_namespace__(self) -> Any: ...
6666
def simple_select(
6767
self, *column_names: str
6868
) -> CompliantLazyFrame: ... # `select` where all args are column names
6969

7070

71-
class CompliantLazyFrame(Protocol):
72-
def __narwhals_lazyframe__(self) -> CompliantLazyFrame: ...
73-
def __narwhals_namespace__(self) -> Any: ...
74-
75-
7671
CompliantSeriesT_co = TypeVar(
7772
"CompliantSeriesT_co", bound=CompliantSeries, covariant=True
7873
)

tests/conftest.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def sqlframe_pyspark_lazy_constructor(
210210
"pyspark": pyspark_lazy_constructor, # type: ignore[dict-item]
211211
# We've reported several bugs to sqlframe - once they address
212212
# them, we can start testing them as part of our CI.
213-
# "sqlframe": pyspark_lazy_constructor, # noqa: ERA001
213+
# "sqlframe": sqlframe_pyspark_lazy_constructor, # noqa: ERA001
214214
}
215215
GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor}
216216

tests/group_by_test.py

+14
Original file line numberDiff line numberDiff line change
@@ -406,3 +406,17 @@ def test_group_by_expr(constructor: Constructor) -> None:
406406
df = nw.from_native(constructor({"a": [1, 1, 3], "b": [4, 5, 6]}))
407407
with pytest.raises(NotImplementedError, match=r"not \(yet\?\) supported"):
408408
df.group_by(nw.col("a")).agg(nw.col("b").mean()) # type: ignore[arg-type]
409+
410+
411+
def test_pandas_group_by_index_and_column_overlap() -> None:
412+
df = pd.DataFrame(
413+
{"a": [1, 1, 2], "b": [4, 5, 6]}, index=pd.Index([0, 1, 2], name="a")
414+
)
415+
result = nw.from_native(df, eager_only=True).group_by("a").agg(nw.col("b").mean())
416+
expected = {"a": [1, 2], "b": [4.5, 6.0]}
417+
assert_equal_data(result, expected)
418+
419+
key, result = next(iter(nw.from_native(df, eager_only=True).group_by("a")))
420+
assert key == (1,)
421+
expected_native = pd.DataFrame({"a": [1, 1], "b": [4, 5]})
422+
pd.testing.assert_frame_equal(result.to_native(), expected_native)

0 commit comments

Comments
 (0)