fix: unify ColumnNotFound for duckdb and pyspark #2493

Open Β· wants to merge 17 commits into main
5 changes: 4 additions & 1 deletion narwhals/_duckdb/dataframe.py
@@ -184,7 +184,10 @@ def select(
         *exprs: DuckDBExpr,
     ) -> Self:
         selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
-        return self._with_native(self.native.select(*selection))
+        try:
+            return self._with_native(self.native.select(*selection))
+        except duckdb.BinderException as e:
+            raise ColumnNotFoundError.from_available_column_names(self.columns) from e
Comment on lines +187 to +190
Member

do we risk catching other errors with this? BinderException might be a bit broad, shall we also match on str(e) before raising ColumnNotFoundError?

we should probably also do this for:

  • with_columns
  • simple_select
  • filter

?
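
For illustration, a minimal sketch of that stricter match. The "not found" substring checked below is an assumption about duckdb's binder message wording, not something this PR verifies; the module-level imports (import duckdb, from narwhals.exceptions import ColumnNotFoundError) are already present in dataframe.py:

    def select(self, *exprs: DuckDBExpr) -> Self:
        selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
        try:
            return self._with_native(self.native.select(*selection))
        except duckdb.BinderException as e:
            # Only translate binder errors that look like a missing column;
            # anything else (e.g. a type-binding failure) propagates unchanged.
            if "not found" in str(e):
                raise ColumnNotFoundError.from_available_column_names(self.columns) from e
            raise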

Member

@MarcoGorelli this has reminded me about an issue with SparkLikeLazyFrame

def _to_arrow_schema(self) -> pa.Schema:  # pragma: no cover
    import pyarrow as pa  # ignore-banned-import
    from narwhals._arrow.utils import narwhals_to_native_dtype

    schema: list[tuple[str, pa.DataType]] = []
    nw_schema = self.collect_schema()
    native_schema = self.native.schema
    for key, value in nw_schema.items():
        try:
            native_dtype = narwhals_to_native_dtype(value, self._version)
        except Exception as exc:  # noqa: BLE001,PERF203
            native_spark_dtype = native_schema[key].dataType  # type: ignore[index]

This one is a bigger problem because it captures CTRL+C, so you can't easily stop the test suite while it's running
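
One way to keep a handler like this interruptible (a defensive sketch, not what this PR changes) is to re-raise interrupt-style exceptions before the broad fallback:

for key, value in nw_schema.items():
    try:
        native_dtype = narwhals_to_native_dtype(value, self._version)
    except (KeyboardInterrupt, SystemExit):
        # Let CTRL+C and interpreter shutdown propagate immediately.
        raise
    except Exception:  # noqa: BLE001
        # Fall back to the native Spark dtype for this one column.
        native_spark_dtype = native_schema[key].dataType  # type: ignore[index]

Note that KeyboardInterrupt derives from BaseException, so a plain except Exception should already let it through; if CTRL+C is still being swallowed, the backend is likely wrapping it in an Exception subclass before it reaches this handler.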

Collaborator Author

Good point @MarcoGorelli!

Regarding simple_select, we are already catching a blind Exception:

if flat_exprs and all(isinstance(x, str) for x in flat_exprs) and not named_exprs:
    # fast path!
    try:
        return self._with_compliant(
            self._compliant_frame.simple_select(*flat_exprs)
        )
    except Exception as e:
        # Column not found is the only thing that can realistically be raised here.
        available_columns = self.columns
        missing_columns = [x for x in flat_exprs if x not in available_columns]
        raise ColumnNotFoundError.from_missing_and_available_column_names(
            missing_columns, available_columns
        ) from e

And it should already be caught by the first test:

if isinstance(df, nw.LazyFrame):
    with pytest.raises(ColumnNotFoundError, match=msg):
        df.select(selected_columns).collect()
else:
    with pytest.raises(ColumnNotFoundError, match=msg):
        df.select(selected_columns)

Maybe we should have a _missing_column_exception property in BaseFrame that is specified for each backend, so we can catch the exact exception here.
What do you think? I could do that in a separate PR to see how it looks.
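
For illustration, a rough sketch of how that could look (the class names and override points below are hypothetical, not part of this PR):

class BaseFrame:
    @property
    def _missing_column_exception(self) -> type[Exception]:
        # Default: backends that already raise our own error.
        from narwhals.exceptions import ColumnNotFoundError

        return ColumnNotFoundError


class DuckDBLazyFrame(BaseFrame):
    @property
    def _missing_column_exception(self) -> type[Exception]:
        import duckdb

        return duckdb.BinderException


class SparkLikeLazyFrame(BaseFrame):
    @property
    def _missing_column_exception(self) -> type[Exception]:
        from pyspark.errors import AnalysisException

        return AnalysisException

simple_select could then catch self._compliant_frame._missing_column_exception instead of a blind Exception.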

Regarding filter and with_columns, I think we are not yet testing what happens when non-existent columns are used in these methods, or am I missing something? I can make another small PR for just this.


     def drop(self, columns: Sequence[str], *, strict: bool) -> Self:
         columns_to_drop = parse_columns_to_drop(self, columns=columns, strict=strict)
8 changes: 8 additions & 0 deletions narwhals/_spark_like/dataframe.py
@@ -15,6 +15,7 @@
 from narwhals._spark_like.utils import import_native_dtypes
 from narwhals._spark_like.utils import import_window
 from narwhals._spark_like.utils import native_to_narwhals_dtype
+from narwhals.exceptions import ColumnNotFoundError
 from narwhals.exceptions import InvalidOperationError
 from narwhals.typing import CompliantLazyFrame
 from narwhals.utils import Implementation
@@ -274,6 +275,13 @@ def select(
     ) -> Self:
         new_columns = evaluate_exprs(self, *exprs)
         new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns]
+        if self._implementation.is_pyspark():  # pragma: no cover
+            from pyspark.errors import AnalysisException
+
+            try:
+                return self._with_native(self.native.select(*new_columns_list))
+            except AnalysisException as e:
+                raise ColumnNotFoundError.from_available_column_names(self.columns) from e
         return self._with_native(self.native.select(*new_columns_list))

     def with_columns(self, *exprs: SparkLikeExpr) -> Self:
10 changes: 10 additions & 0 deletions narwhals/exceptions.py
@@ -38,6 +38,16 @@ def from_missing_and_available_column_names(
         )
         return ColumnNotFoundError(message)

+    @classmethod
+    def from_available_column_names(
+        cls: type, available_columns: list[str]
+    ) -> ColumnNotFoundError:
+        message = (
+            "The selected columns were not found."
+            f"\n\nHint: Did you mean one of these columns: {available_columns}?"
+        )
+        return ColumnNotFoundError(message)
+

 class ComputeError(NarwhalsError):
     """Exception raised when the underlying computation could not be evaluated."""
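
For reference, a quick sketch of the message the new classmethod produces (output reconstructed from the template above):

from narwhals.exceptions import ColumnNotFoundError

err = ColumnNotFoundError.from_available_column_names(["a", "b", "z"])
print(err)
# The selected columns were not found.
#
# Hint: Did you mean one of these columns: ['a', 'b', 'z']?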
13 changes: 12 additions & 1 deletion tests/conftest.py
@@ -27,6 +27,7 @@
 from narwhals.typing import NativeLazyFrame
 from tests.utils import Constructor
 from tests.utils import ConstructorEager
+from tests.utils import ConstructorLazy

 Data: TypeAlias = "dict[str, list[Any]]"

@@ -227,7 +228,7 @@ def sqlframe_pyspark_lazy_constructor(obj: Data) -> SQLFrameDataFrame:  # pragma
"cudf": cudf_constructor,
"polars[eager]": polars_eager_constructor,
}
LAZY_CONSTRUCTORS: dict[str, Constructor] = {
LAZY_CONSTRUCTORS: dict[str, ConstructorLazy] = {
"dask": dask_lazy_p2_constructor,
"polars[lazy]": polars_lazy_constructor,
"duckdb": duckdb_lazy_constructor,
@@ -259,6 +260,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:

     eager_constructors: list[ConstructorEager] = []
     eager_constructors_ids: list[str] = []
+    lazy_constructors: list[ConstructorLazy] = []
+    lazy_constructors_ids: list[str] = []
     constructors: list[Constructor] = []
     constructors_ids: list[str] = []

@@ -274,8 +277,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             eager_constructors_ids.append(constructor)
             constructors.append(EAGER_CONSTRUCTORS[constructor])
         elif constructor in {"pyspark", "pyspark[connect]"}:  # pragma: no cover
+            lazy_constructors.append(pyspark_lazy_constructor())
+            lazy_constructors_ids.append(constructor)
             constructors.append(pyspark_lazy_constructor())
         elif constructor in LAZY_CONSTRUCTORS:
+            lazy_constructors.append(LAZY_CONSTRUCTORS[constructor])
+            lazy_constructors_ids.append(constructor)
             constructors.append(LAZY_CONSTRUCTORS[constructor])
         else:  # pragma: no cover
             msg = f"Expected one of {EAGER_CONSTRUCTORS.keys()} or {LAZY_CONSTRUCTORS.keys()}, got {constructor}"
@@ -286,5 +293,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
         metafunc.parametrize(
             "constructor_eager", eager_constructors, ids=eager_constructors_ids
         )
+    elif "constructor_lazy" in metafunc.fixturenames:
+        metafunc.parametrize(
+            "constructor_lazy", lazy_constructors, ids=lazy_constructors_ids
+        )
     elif "constructor" in metafunc.fixturenames:
         metafunc.parametrize("constructor", constructors, ids=constructors_ids)
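
With that wiring in place, a test can request the new fixture directly; a hypothetical example (the test itself is not part of this PR):

import narwhals as nw

from tests.utils import ConstructorLazy


def test_some_lazy_behaviour(constructor_lazy: ConstructorLazy) -> None:
    # pytest_generate_tests parametrizes `constructor_lazy` over every
    # selected lazy backend (dask, polars[lazy], duckdb, pyspark, ...).
    df = nw.from_native(constructor_lazy({"a": [1, 2, 3]}))
    assert isinstance(df, nw.LazyFrame)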
86 changes: 54 additions & 32 deletions tests/frame/select_test.py
Collaborator Author

I split the test into lazy and eager variants to simplify the if-else statements a bit. I hope it is a bit more readable?

@@ -15,6 +15,7 @@
 from tests.utils import POLARS_VERSION
 from tests.utils import Constructor
 from tests.utils import ConstructorEager
+from tests.utils import ConstructorLazy
 from tests.utils import assert_equal_data


@@ -83,48 +84,69 @@ def test_comparison_with_list_error_message() -> None:
     nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3]  # noqa: B015


-def test_missing_columns(
-    constructor: Constructor, request: pytest.FixtureRequest
-) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_missing_columns_eager(constructor_eager: ConstructorEager) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
-    df = nw.from_native(constructor(data))
+    df = nw.from_native(constructor_eager(data))
     selected_columns = ["a", "e", "f"]
     msg = (
         r"The following columns were not found: \[.*\]"
         r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
     )
-    if "polars" in str(constructor):
-        # In the lazy case, Polars only errors when we call `collect`,
-        # and we have no way to recover exactly which columns the user
-        # tried selecting. So, we just emit their message (which varies
-        # across versions...)
-        msg = "e|f"
-    if isinstance(df, nw.LazyFrame):
-        with pytest.raises(ColumnNotFoundError, match=msg):
-            df.select(selected_columns).collect()
-    else:
-        with pytest.raises(ColumnNotFoundError, match=msg):
-            df.select(selected_columns)
-    if POLARS_VERSION >= (1,):
-        # Old Polars versions wouldn't raise an error
-        # at all here
-        if isinstance(df, nw.LazyFrame):
-            with pytest.raises(ColumnNotFoundError, match=msg):
-                df.drop(selected_columns, strict=True).collect()
-        else:
-            with pytest.raises(ColumnNotFoundError, match=msg):
-                df.drop(selected_columns, strict=True)
-    else:  # pragma: no cover
-        pass
+    with pytest.raises(ColumnNotFoundError, match=msg):
+        df.select(selected_columns)
+    if "polars" in str(constructor_eager) and POLARS_VERSION < (1,):  # pragma: no cover
+        # Old Polars versions wouldn't raise an error at all here
+        pass
+    else:
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            df.select(selected_columns)
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            df.drop(selected_columns, strict=True)
+    if "polars" in str(constructor_eager):
+        msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            df.select(nw.col("fdfa"))
Comment on lines +105 to +106
Collaborator Author

Before, this was not tested for polars.



+def test_missing_columns_lazy(
+    constructor_lazy: ConstructorLazy, request: pytest.FixtureRequest
+) -> None:
+    constructor_id = str(request.node.callspec.id)
+    if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")):
Collaborator Author (@EdAbati, May 9, 2025)

sqlframe and pyspark.connect raise errors at collect. πŸ˜•

I need to double-check pyspark.connect. Currently I cannot set it up locally... Working on it ⏳

Do you have an idea on how to deal with these?

+        # These backends raise errors at collect
+        request.applymarker(pytest.mark.xfail)
+    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
+    df = nw.from_native(constructor_lazy(data))
+    selected_columns = ["a", "e", "f"]
+
+    def maybe_collect(df: nw.LazyFrame[Any]) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
+        if constructor_id == "polars[lazy]":
+            # In the lazy case, Polars only errors when we call `collect`,
+            # and we have no way to recover exactly which columns the user
+            # tried selecting. So, we just emit their message (which varies
+            # across versions...)
+            return df.collect()
+        return df
+
+    if constructor_id == "polars[lazy]":
+        msg = r"^e|\"(e|f)\""
Collaborator Author (@EdAbati, May 9, 2025)

Before, it was msg = "e|f". Now it is a bit stricter.

+    elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")):
+        msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
+    else:
+        msg = (
+            r"The following columns were not found: \[.*\]"
+            r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
+        )
+    with pytest.raises(ColumnNotFoundError, match=msg):
+        maybe_collect(df.select(selected_columns))
+    if constructor_id == "polars[lazy]" and POLARS_VERSION < (1,):  # pragma: no cover
+        # Old Polars versions wouldn't raise an error at all here
+        pass
+    else:
+        with pytest.raises(ColumnNotFoundError, match=msg):
-            df.select(nw.col("fdfa"))
+            maybe_collect(df.drop(selected_columns, strict=True))
+    if "polars" in str(constructor_lazy):
+        msg = r"^fdfa"
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            maybe_collect(df.select(nw.col("fdfa")))


 def test_left_to_right_broadcasting(constructor: Constructor) -> None:
1 change: 1 addition & 0 deletions tests/utils.py
@@ -46,6 +46,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]:

 Constructor: TypeAlias = Callable[[Any], "NativeLazyFrame | NativeFrame | DataFrameLike"]
 ConstructorEager: TypeAlias = Callable[[Any], "NativeFrame | DataFrameLike"]
+ConstructorLazy: TypeAlias = Callable[[Any], "NativeLazyFrame"]


 def zip_strict(left: Sequence[Any], right: Sequence[Any]) -> Iterator[Any]: