fix: pandas was raising when index name and column names overlapped in groupby (#1908)

MarcoGorelli · web-flow · commit 4ce9b0f0f2ff · 2025-02-01T13:53:23.000Z
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@
 [![PyPI version](https://badge.fury.io/py/narwhals.svg)](https://badge.fury.io/py/narwhals)
 [![Downloads](https://static.pepy.tech/badge/narwhals/month)](https://pepy.tech/project/narwhals)
 [![Trusted publishing](https://img.shields.io/badge/Trusted_publishing-Provides_attestations-bright_green)](https://peps.python.org/pep-0740/)
+[![PYPI - Types](https://img.shields.io/pypi/types/narwhals)](https://pypi.org/project/narwhals)
 
 Extremely lightweight and extensible compatibility layer between dataframe libraries!
 
diff --git a/docs/pandas_like_concepts/pandas_index.md b/docs/pandas_like_concepts/pandas_index.md
@@ -16,7 +16,7 @@ Narwhals aims to accommodate both!
 
 Let's learn about what Narwhals promises.
 
-## 1. Narwhals will preserve your index for dataframe operations
+## 1. Narwhals will preserve your index for common dataframe operations
 
 ```python exec="1" source="above" session="ex1"
 import narwhals as nw
@@ -39,7 +39,10 @@ print(my_func(df))
 ```
 
 Note how the result still has the original index - Narwhals did not modify
-it.
+it. Narwhals will preserve your original index for most common dataframe
+operations. However, Narwhals will _not_ preserve the original index for
+`DataFrame.group_by`, because there, overlapping index and column names
+raise errors.
 
 ## 2. Index alignment follows the left-hand-rule
 
diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py
@@ -43,32 +43,27 @@ def __init__(
     ) -> None:
         self._df = df
         self._keys = keys
+        # Drop index to avoid potential collisions:
+        # https://github.com/narwhals-dev/narwhals/issues/1907.
+        native_frame = df._native_frame.reset_index(drop=True)
         if (
             self._df._implementation is Implementation.PANDAS
             and self._df._backend_version < (1, 1)
         ):  # pragma: no cover
             if (
                 not drop_null_keys
-                and select_columns_by_name(
-                    self._df._native_frame,
-                    self._keys,
-                    self._df._backend_version,
-                    self._df._implementation,
-                )
-                .isna()
-                .any()
-                .any()
+                and self._df.simple_select(*self._keys)._native_frame.isna().any().any()
             ):
-                msg = "Grouping by null values is not supported in pandas < 1.0.0"
+                msg = "Grouping by null values is not supported in pandas < 1.1.0"
                 raise NotImplementedError(msg)
-            self._grouped = self._df._native_frame.groupby(
+            self._grouped = native_frame.groupby(
                 list(self._keys),
                 sort=False,
                 as_index=True,
                 observed=True,
             )
         else:
-            self._grouped = self._df._native_frame.groupby(
+            self._grouped = native_frame.groupby(
                 list(self._keys),
                 sort=False,
                 as_index=True,
diff --git a/narwhals/typing.py b/narwhals/typing.py
@@ -60,19 +60,14 @@ def simple_select(
     ) -> CompliantDataFrame: ...  # `select` where all args are column names
 
 
-class CompliantLazyFramme(Protocol):
-    def __narwhals_lazyframe__(self) -> CompliantDataFrame: ...
+class CompliantLazyFrame(Protocol):
+    def __narwhals_lazyframe__(self) -> CompliantLazyFrame: ...
     def __narwhals_namespace__(self) -> Any: ...
     def simple_select(
         self, *column_names: str
     ) -> CompliantLazyFrame: ...  # `select` where all args are column names
 
 
-class CompliantLazyFrame(Protocol):
-    def __narwhals_lazyframe__(self) -> CompliantLazyFrame: ...
-    def __narwhals_namespace__(self) -> Any: ...
-
-
 CompliantSeriesT_co = TypeVar(
     "CompliantSeriesT_co", bound=CompliantSeries, covariant=True
 )
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -210,7 +210,7 @@ def sqlframe_pyspark_lazy_constructor(
     "pyspark": pyspark_lazy_constructor,  # type: ignore[dict-item]
     # We've reported several bugs to sqlframe - once they address
     # them, we can start testing them as part of our CI.
-    # "sqlframe": pyspark_lazy_constructor,  # noqa: ERA001
+    # "sqlframe": sqlframe_pyspark_lazy_constructor,  # noqa: ERA001
 }
 GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor}
 
diff --git a/tests/group_by_test.py b/tests/group_by_test.py
@@ -406,3 +406,17 @@ def test_group_by_expr(constructor: Constructor) -> None:
     df = nw.from_native(constructor({"a": [1, 1, 3], "b": [4, 5, 6]}))
     with pytest.raises(NotImplementedError, match=r"not \(yet\?\) supported"):
         df.group_by(nw.col("a")).agg(nw.col("b").mean())  # type: ignore[arg-type]
+
+
+def test_pandas_group_by_index_and_column_overlap() -> None:
+    df = pd.DataFrame(
+        {"a": [1, 1, 2], "b": [4, 5, 6]}, index=pd.Index([0, 1, 2], name="a")
+    )
+    result = nw.from_native(df, eager_only=True).group_by("a").agg(nw.col("b").mean())
+    expected = {"a": [1, 2], "b": [4.5, 6.0]}
+    assert_equal_data(result, expected)
+
+    key, result = next(iter(nw.from_native(df, eager_only=True).group_by("a")))
+    assert key == (1,)
+    expected_native = pd.DataFrame({"a": [1, 1], "b": [4, 5]})
+    pd.testing.assert_frame_equal(result.to_native(), expected_native)

Original file line number	Diff line number	Diff line change
`@@ -210,7 +210,7 @@ def sqlframe_pyspark_lazy_constructor(`
`210`	`210`	`"pyspark": pyspark_lazy_constructor, # type: ignore[dict-item]`
`211`	`211`	`# We've reported several bugs to sqlframe - once they address`
`212`	`212`	`# them, we can start testing them as part of our CI.`
`213`		`- # "sqlframe": pyspark_lazy_constructor, # noqa: ERA001`
	`213`	`+ # "sqlframe": sqlframe_pyspark_lazy_constructor, # noqa: ERA001`
`214`	`214`	`}`
`215`	`215`	`GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor}`
`216`	`216`