From 6f7a574abcf9230c9a525441cd0f8100b784435c Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sun, 4 May 2025 22:45:05 +0200 Subject: [PATCH 01/18] unify ColumnNotFound --- narwhals/_duckdb/expr.py | 12 ++++++++++-- narwhals/_spark_like/dataframe.py | 8 ++++++++ narwhals/_spark_like/expr.py | 10 +++++++++- tests/frame/select_test.py | 6 +----- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 81723d52ee..c2e9dc0cec 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -9,6 +9,7 @@ from typing import Sequence from typing import cast +import duckdb from duckdb import CoalesceOperator from duckdb import FunctionExpression from duckdb import StarExpression @@ -28,12 +29,12 @@ from narwhals._duckdb.utils import narwhals_to_native_dtype from narwhals._duckdb.utils import when from narwhals._expression_parsing import ExprKind +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import not_implemented from narwhals.utils import requires if TYPE_CHECKING: - import duckdb from typing_extensions import Self from narwhals._compliant.typing import AliasNames @@ -186,7 +187,14 @@ def from_column_names( context: _FullContext, ) -> Self: def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - return [col(name) for name in evaluate_column_names(df)] + col_names = evaluate_column_names(df) + missing_columns = [c for c in col_names if c not in df.columns] + if missing_columns: + raise ColumnNotFoundError.from_missing_and_available_column_names( + missing_columns=missing_columns, + available_columns=df.columns, + ) + return [col(name) for name in col_names] return cls( func, diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index e3f6fea425..64ebcc75bd 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -15,6 +15,7 @@ from narwhals._spark_like.utils import import_native_dtypes from narwhals._spark_like.utils import import_window from narwhals._spark_like.utils import native_to_narwhals_dtype +from narwhals.exceptions import ColumnNotFoundError from narwhals.exceptions import InvalidOperationError from narwhals.typing import CompliantLazyFrame from narwhals.utils import Implementation @@ -257,6 +258,13 @@ def collect( raise ValueError(msg) # pragma: no cover def simple_select(self, *column_names: str) -> Self: + df_columns = self.columns + missing_columns = [c for c in column_names if c not in df_columns] + if missing_columns: + raise ColumnNotFoundError.from_missing_and_available_column_names( + missing_columns=missing_columns, + available_columns=df_columns, + ) return self._with_native(self.native.select(*column_names)) def aggregate( diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index d33b0e1662..ce27ea6e3c 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -20,6 +20,7 @@ from narwhals._spark_like.utils import import_window from narwhals._spark_like.utils import narwhals_to_native_dtype from narwhals.dependencies import get_pyspark +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import not_implemented from narwhals.utils import parse_version @@ -224,7 +225,14 @@ def from_column_names( context: _FullContext, ) -> Self: def func(df: SparkLikeLazyFrame) -> list[Column]: - return [df._F.col(col_name) for col_name in evaluate_column_names(df)] + col_names = evaluate_column_names(df) + missing_columns = [c for c in col_names if c not in df.columns] + if missing_columns: + raise ColumnNotFoundError.from_missing_and_available_column_names( + missing_columns=missing_columns, + available_columns=df.columns, + ) + return [df._F.col(col_name) for col_name in col_names] return cls( func, diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 010fd7bfe3..14b6a8d0a3 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -83,11 +83,7 @@ def test_comparison_with_list_error_message() -> None: nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3] # noqa: B015 -def test_missing_columns( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_missing_columns(constructor: Constructor) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor(data)) selected_columns = ["a", "e", "f"] From 8fe45e682d4667077ffce28dc0dc60e56f9ee574 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sun, 4 May 2025 22:51:20 +0200 Subject: [PATCH 02/18] revert --- narwhals/_duckdb/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index c2e9dc0cec..dd920e6158 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -9,7 +9,6 @@ from typing import Sequence from typing import cast -import duckdb from duckdb import CoalesceOperator from duckdb import FunctionExpression from duckdb import StarExpression @@ -35,6 +34,7 @@ from narwhals.utils import requires if TYPE_CHECKING: + import duckdb from typing_extensions import Self from narwhals._compliant.typing import AliasNames From 6d58bc215b0bae059ad920f15584034aa3f95010 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 6 May 2025 08:03:16 +0200 Subject: [PATCH 03/18] try except during select --- narwhals/_duckdb/dataframe.py | 6 +++++- narwhals/_duckdb/expr.py | 10 +--------- narwhals/_spark_like/dataframe.py | 15 +++++++-------- narwhals/_spark_like/expr.py | 10 +--------- tests/frame/select_test.py | 18 +++++++++++++----- 5 files changed, 27 insertions(+), 32 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index c1e34d26e2..d525e2fe1c 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -184,7 +184,11 @@ def select( *exprs: DuckDBExpr, ) -> Self: selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs)) - return self._with_native(self.native.select(*selection)) + try: + return self._with_native(self.native.select(*selection)) + except duckdb.BinderException as e: + msg = f"Selected columns not found in the DataFrame.\n\nHint: Did you mean one of these columns: {self.columns}?" + raise ColumnNotFoundError(msg) from e def drop(self, columns: Sequence[str], *, strict: bool) -> Self: columns_to_drop = parse_columns_to_drop(self, columns=columns, strict=strict) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index dd920e6158..81723d52ee 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -28,7 +28,6 @@ from narwhals._duckdb.utils import narwhals_to_native_dtype from narwhals._duckdb.utils import when from narwhals._expression_parsing import ExprKind -from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import not_implemented from narwhals.utils import requires @@ -187,14 +186,7 @@ def from_column_names( context: _FullContext, ) -> Self: def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - col_names = evaluate_column_names(df) - missing_columns = [c for c in col_names if c not in df.columns] - if missing_columns: - raise ColumnNotFoundError.from_missing_and_available_column_names( - missing_columns=missing_columns, - available_columns=df.columns, - ) - return [col(name) for name in col_names] + return [col(name) for name in evaluate_column_names(df)] return cls( func, diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 64ebcc75bd..c07f75e38b 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -258,13 +258,6 @@ def collect( raise ValueError(msg) # pragma: no cover def simple_select(self, *column_names: str) -> Self: - df_columns = self.columns - missing_columns = [c for c in column_names if c not in df_columns] - if missing_columns: - raise ColumnNotFoundError.from_missing_and_available_column_names( - missing_columns=missing_columns, - available_columns=df_columns, - ) return self._with_native(self.native.select(*column_names)) def aggregate( @@ -282,7 +275,13 @@ def select( ) -> Self: new_columns = evaluate_exprs(self, *exprs) new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns] - return self._with_native(self.native.select(*new_columns_list)) + if self._implementation.is_sqlframe(): + return self._with_native(self.native.select(*new_columns_list)) + try: + return self._with_native(self.native.select(*new_columns_list)) + except Exception as e: + msg = f"Selected columns not found in the DataFrame.\n\nHint: Did you mean one of these columns: {self.columns}?" + raise ColumnNotFoundError(msg) from e def with_columns(self, *exprs: SparkLikeExpr) -> Self: new_columns = evaluate_exprs(self, *exprs) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index ce27ea6e3c..d33b0e1662 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -20,7 +20,6 @@ from narwhals._spark_like.utils import import_window from narwhals._spark_like.utils import narwhals_to_native_dtype from narwhals.dependencies import get_pyspark -from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import not_implemented from narwhals.utils import parse_version @@ -225,14 +224,7 @@ def from_column_names( context: _FullContext, ) -> Self: def func(df: SparkLikeLazyFrame) -> list[Column]: - col_names = evaluate_column_names(df) - missing_columns = [c for c in col_names if c not in df.columns] - if missing_columns: - raise ColumnNotFoundError.from_missing_and_available_column_names( - missing_columns=missing_columns, - available_columns=df.columns, - ) - return [df._F.col(col_name) for col_name in col_names] + return [df._F.col(col_name) for col_name in evaluate_column_names(df)] return cls( func, diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 14b6a8d0a3..462ead4df4 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -83,14 +83,22 @@ def test_comparison_with_list_error_message() -> None: nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3] # noqa: B015 -def test_missing_columns(constructor: Constructor) -> None: +def test_missing_columns( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "sqlframe" in str(constructor): + request.applymarker(pytest.mark.xfail(reason="sqlframe doesn't raise this error")) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor(data)) selected_columns = ["a", "e", "f"] - msg = ( - r"The following columns were not found: \[.*\]" - r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" - ) + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" + ) + if "polars" in str(constructor): # In the lazy case, Polars only errors when we call `collect`, # and we have no way to recover exactly which columns the user From be834b3aa859d22d277a67e493f1d6ca870daf27 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 6 May 2025 08:06:56 +0200 Subject: [PATCH 04/18] catch correct exception --- narwhals/_spark_like/dataframe.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index c07f75e38b..177e81e13a 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -275,13 +275,15 @@ def select( ) -> Self: new_columns = evaluate_exprs(self, *exprs) new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns] - if self._implementation.is_sqlframe(): - return self._with_native(self.native.select(*new_columns_list)) - try: - return self._with_native(self.native.select(*new_columns_list)) - except Exception as e: - msg = f"Selected columns not found in the DataFrame.\n\nHint: Did you mean one of these columns: {self.columns}?" - raise ColumnNotFoundError(msg) from e + if not self._implementation.is_sqlframe(): + from pyspark.errors import AnalysisException + + try: + return self._with_native(self.native.select(*new_columns_list)) + except AnalysisException as e: + msg = f"Selected columns not found in the DataFrame.\n\nHint: Did you mean one of these columns: {self.columns}?" + raise ColumnNotFoundError(msg) from e + return self._with_native(self.native.select(*new_columns_list)) def with_columns(self, *exprs: SparkLikeExpr) -> Self: new_columns = evaluate_exprs(self, *exprs) From 19d6e24488eef231292882adc0632f4bdfad0ad4 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 6 May 2025 08:13:46 +0200 Subject: [PATCH 05/18] coverage --- narwhals/_spark_like/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 177e81e13a..e284d62d9d 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -275,7 +275,7 @@ def select( ) -> Self: new_columns = evaluate_exprs(self, *exprs) new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns] - if not self._implementation.is_sqlframe(): + if not self._implementation.is_sqlframe(): # pragma: no cover from pyspark.errors import AnalysisException try: From 917d0730d2908745c76558b09702e5074c870a4c Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 9 May 2025 11:55:17 +0200 Subject: [PATCH 06/18] separate lazy and eager tests --- narwhals/_spark_like/dataframe.py | 2 +- tests/conftest.py | 13 +++++- tests/frame/select_test.py | 76 +++++++++++++++++-------------- tests/utils.py | 1 + 4 files changed, 55 insertions(+), 37 deletions(-) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index d20eda8fd2..938d479a81 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -275,7 +275,7 @@ def select( ) -> Self: new_columns = evaluate_exprs(self, *exprs) new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns] - if not self._implementation.is_sqlframe(): # pragma: no cover + if self._implementation.is_pyspark(): # pragma: no cover from pyspark.errors import AnalysisException try: diff --git a/tests/conftest.py b/tests/conftest.py index a7d1dc850f..ac92782a07 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,6 +27,7 @@ from narwhals.typing import NativeLazyFrame from tests.utils import Constructor from tests.utils import ConstructorEager + from tests.utils import ConstructorLazy Data: TypeAlias = "dict[str, list[Any]]" @@ -227,7 +228,7 @@ def sqlframe_pyspark_lazy_constructor(obj: Data) -> SQLFrameDataFrame: # pragma "cudf": cudf_constructor, "polars[eager]": polars_eager_constructor, } -LAZY_CONSTRUCTORS: dict[str, Constructor] = { +LAZY_CONSTRUCTORS: dict[str, ConstructorLazy] = { "dask": dask_lazy_p2_constructor, "polars[lazy]": polars_lazy_constructor, "duckdb": duckdb_lazy_constructor, @@ -259,6 +260,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: eager_constructors: list[ConstructorEager] = [] eager_constructors_ids: list[str] = [] + lazy_constructors: list[ConstructorLazy] = [] + lazy_constructors_ids: list[str] = [] constructors: list[Constructor] = [] constructors_ids: list[str] = [] @@ -274,8 +277,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: eager_constructors_ids.append(constructor) constructors.append(EAGER_CONSTRUCTORS[constructor]) elif constructor in {"pyspark", "pyspark[connect]"}: # pragma: no cover + lazy_constructors.append(pyspark_lazy_constructor()) + lazy_constructors_ids.append(constructor) constructors.append(pyspark_lazy_constructor()) elif constructor in LAZY_CONSTRUCTORS: + lazy_constructors.append(LAZY_CONSTRUCTORS[constructor]) + lazy_constructors_ids.append(constructor) constructors.append(LAZY_CONSTRUCTORS[constructor]) else: # pragma: no cover msg = f"Expected one of {EAGER_CONSTRUCTORS.keys()} or {LAZY_CONSTRUCTORS.keys()}, got {constructor}" @@ -286,5 +293,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: metafunc.parametrize( "constructor_eager", eager_constructors, ids=eager_constructors_ids ) + elif "constructor_lazy" in metafunc.fixturenames: + metafunc.parametrize( + "constructor_lazy", lazy_constructors, ids=lazy_constructors_ids + ) elif "constructor" in metafunc.fixturenames: metafunc.parametrize("constructor", constructors, ids=constructors_ids) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 462ead4df4..926011b569 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -15,6 +15,7 @@ from tests.utils import POLARS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager +from tests.utils import ConstructorLazy from tests.utils import assert_equal_data @@ -83,52 +84,57 @@ def test_comparison_with_list_error_message() -> None: nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3] # noqa: B015 -def test_missing_columns( - constructor: Constructor, request: pytest.FixtureRequest +def test_missing_columns_eager(constructor_eager: ConstructorEager) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} + df = nw.from_native(constructor_eager(data)) + selected_columns = ["a", "e", "f"] + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" + ) + with pytest.raises(ColumnNotFoundError, match=msg): + df.select(selected_columns) + if "polars" in str(constructor_eager) and POLARS_VERSION < (1,): + # Old Polars versions wouldn't raise an error at all here + pass + else: + with pytest.raises(ColumnNotFoundError, match=msg): + df.drop(selected_columns, strict=True) + if "polars" in str(constructor_eager): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" + with pytest.raises(ColumnNotFoundError, match=msg): + df.select(nw.col("fdfa")) + + +def test_missing_columns_lazy( + constructor_lazy: ConstructorLazy, request: pytest.FixtureRequest ) -> None: - if "sqlframe" in str(constructor): - request.applymarker(pytest.mark.xfail(reason="sqlframe doesn't raise this error")) + if any(x in str(constructor_lazy) for x in ("sqlframe", "pyspark[connect]")): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} - df = nw.from_native(constructor(data)) + df = nw.from_native(constructor_lazy(data)) selected_columns = ["a", "e", "f"] - if any(x in str(constructor) for x in ("duckdb", "pyspark")): + if "polars" in str(constructor_lazy): + msg = r"^e|\"(e|f)\"" + elif any(x in str(constructor_lazy) for x in ("duckdb", "pyspark")): msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" else: msg = ( r"The following columns were not found: \[.*\]" r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" ) - - if "polars" in str(constructor): - # In the lazy case, Polars only errors when we call `collect`, - # and we have no way to recover exactly which columns the user - # tried selecting. So, we just emit their message (which varies - # across versions...) - msg = "e|f" - if isinstance(df, nw.LazyFrame): - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns).collect() - else: - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns) - if POLARS_VERSION >= (1,): - # Old Polars versions wouldn't raise an error - # at all here - if isinstance(df, nw.LazyFrame): - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True).collect() - else: - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True) - else: # pragma: no cover - pass + with pytest.raises(ColumnNotFoundError, match=msg): + df.select(selected_columns).collect() + if "polars" in str(constructor_lazy) and POLARS_VERSION < (1,): + # Old Polars versions wouldn't raise an error at all here + pass else: with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns) - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True) - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(nw.col("fdfa")) + df.drop(selected_columns, strict=True).collect() + if "polars" in str(constructor_lazy): + msg = r"^fdfa" + with pytest.raises(ColumnNotFoundError, match=msg): + df.select(nw.col("fdfa")).collect() def test_left_to_right_broadcasting(constructor: Constructor) -> None: diff --git a/tests/utils.py b/tests/utils.py index 4469dcf0fe..f311a54cd9 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -46,6 +46,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]: Constructor: TypeAlias = Callable[[Any], "NativeLazyFrame | NativeFrame | DataFrameLike"] ConstructorEager: TypeAlias = Callable[[Any], "NativeFrame | DataFrameLike"] +ConstructorLazy: TypeAlias = Callable[[Any], "NativeLazyFrame"] def zip_strict(left: Sequence[Any], right: Sequence[Any]) -> Iterator[Any]: From 5d2972d0430d064f5c5555b5b3da85f8d25f8e97 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 9 May 2025 13:32:05 +0200 Subject: [PATCH 07/18] coverage --- tests/frame/select_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 926011b569..d235ed59e6 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -94,7 +94,7 @@ def test_missing_columns_eager(constructor_eager: ConstructorEager) -> None: ) with pytest.raises(ColumnNotFoundError, match=msg): df.select(selected_columns) - if "polars" in str(constructor_eager) and POLARS_VERSION < (1,): + if "polars" in str(constructor_eager) and POLARS_VERSION < (1,): # pragma: no cover # Old Polars versions wouldn't raise an error at all here pass else: @@ -125,7 +125,7 @@ def test_missing_columns_lazy( ) with pytest.raises(ColumnNotFoundError, match=msg): df.select(selected_columns).collect() - if "polars" in str(constructor_lazy) and POLARS_VERSION < (1,): + if "polars" in str(constructor_lazy) and POLARS_VERSION < (1,): # pragma: no cover # Old Polars versions wouldn't raise an error at all here pass else: From 40801014dfcbb8b3c694e98030e69e2a49f6943d Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 9 May 2025 13:44:18 +0200 Subject: [PATCH 08/18] cleanup exception --- narwhals/_duckdb/dataframe.py | 3 +-- narwhals/_spark_like/dataframe.py | 3 +-- narwhals/exceptions.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index d525e2fe1c..48338170b0 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -187,8 +187,7 @@ def select( try: return self._with_native(self.native.select(*selection)) except duckdb.BinderException as e: - msg = f"Selected columns not found in the DataFrame.\n\nHint: Did you mean one of these columns: {self.columns}?" - raise ColumnNotFoundError(msg) from e + raise ColumnNotFoundError.from_available_column_names(self.columns) from e def drop(self, columns: Sequence[str], *, strict: bool) -> Self: columns_to_drop = parse_columns_to_drop(self, columns=columns, strict=strict) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 938d479a81..4a1b0f516f 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -281,8 +281,7 @@ def select( try: return self._with_native(self.native.select(*new_columns_list)) except AnalysisException as e: - msg = f"Selected columns not found in the DataFrame.\n\nHint: Did you mean one of these columns: {self.columns}?" - raise ColumnNotFoundError(msg) from e + raise ColumnNotFoundError.from_available_column_names(self.columns) from e return self._with_native(self.native.select(*new_columns_list)) def with_columns(self, *exprs: SparkLikeExpr) -> Self: diff --git a/narwhals/exceptions.py b/narwhals/exceptions.py index 6655d9e55e..ebd7108b09 100644 --- a/narwhals/exceptions.py +++ b/narwhals/exceptions.py @@ -38,6 +38,16 @@ def from_missing_and_available_column_names( ) return ColumnNotFoundError(message) + @classmethod + def from_available_column_names( + cls: type, available_columns: list[str] + ) -> ColumnNotFoundError: + message = ( + "The selected columns were not found." + f"\n\nHint: Did you mean one of these columns: {available_columns}?" + ) + return ColumnNotFoundError(message) + class ComputeError(NarwhalsError): """Exception raised when the underlying computation could not be evaluated.""" From 844259667cc6e47f847b5f930b20408125a7c03b Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 9 May 2025 13:58:24 +0200 Subject: [PATCH 09/18] use constructor_id --- tests/frame/select_test.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index d235ed59e6..6f7899c8ba 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -109,14 +109,25 @@ def test_missing_columns_eager(constructor_eager: ConstructorEager) -> None: def test_missing_columns_lazy( constructor_lazy: ConstructorLazy, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor_lazy) for x in ("sqlframe", "pyspark[connect]")): + constructor_id = str(request.node.callspec.id) + if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor_lazy(data)) selected_columns = ["a", "e", "f"] - if "polars" in str(constructor_lazy): + + def maybe_collect(df: nw.LazyFrame[Any]) -> nw.DataFrame[Any] | nw.LazyFrame[Any]: + if constructor_id == "polars[lazy]": + # In the lazy case, Polars only errors when we call `collect`, + # and we have no way to recover exactly which columns the user + # tried selecting. So, we just emit their message (which varies + # across versions...) + return df.collect() + return df + + if constructor_id == "polars[lazy]": msg = r"^e|\"(e|f)\"" - elif any(x in str(constructor_lazy) for x in ("duckdb", "pyspark")): + elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" else: msg = ( @@ -124,17 +135,17 @@ def test_missing_columns_lazy( r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" ) with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns).collect() - if "polars" in str(constructor_lazy) and POLARS_VERSION < (1,): # pragma: no cover + maybe_collect(df.select(selected_columns)) + if constructor_id == "polars[lazy]" and POLARS_VERSION < (1,): # pragma: no cover # Old Polars versions wouldn't raise an error at all here pass else: with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True).collect() + maybe_collect(df.drop(selected_columns, strict=True)) if "polars" in str(constructor_lazy): msg = r"^fdfa" with pytest.raises(ColumnNotFoundError, match=msg): - df.select(nw.col("fdfa")).collect() + maybe_collect(df.select(nw.col("fdfa"))) def test_left_to_right_broadcasting(constructor: Constructor) -> None: From 937a1234f46e363c03f1efedbf12dffd0b912f37 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 9 May 2025 14:36:05 +0200 Subject: [PATCH 10/18] what is going on in pyspark connect? --- tests/frame/select_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 6f7899c8ba..ad15d7bfb2 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -110,7 +110,8 @@ def test_missing_columns_lazy( constructor_lazy: ConstructorLazy, request: pytest.FixtureRequest ) -> None: constructor_id = str(request.node.callspec.id) - if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): + # if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): + if constructor_id == "sqlframe": request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor_lazy(data)) From 0544f30ed1b5d134f51a46caf9d31ba9b5798185 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Fri, 9 May 2025 14:48:27 +0200 Subject: [PATCH 11/18] ignore pyspark connect --- tests/frame/select_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index ad15d7bfb2..72bfcf9cc5 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -110,8 +110,8 @@ def test_missing_columns_lazy( constructor_lazy: ConstructorLazy, request: pytest.FixtureRequest ) -> None: constructor_id = str(request.node.callspec.id) - # if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): - if constructor_id == "sqlframe": + if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): + # These backend raise errors at collect request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor_lazy(data)) From b518a5a0036ab935ff6c02eeb9e4d3a941dacf0e Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 07:55:56 +0200 Subject: [PATCH 12/18] added missing column tests --- tests/frame/filter_test.py | 24 ++++++++++++++++++++++++ tests/frame/with_columns_test.py | 25 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index b69211241b..a4876a5f95 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -5,6 +5,7 @@ import pytest import narwhals as nw +from narwhals.exceptions import ColumnNotFoundError from narwhals.exceptions import LengthChangingExprError from narwhals.exceptions import ShapeError from tests.utils import Constructor @@ -68,3 +69,26 @@ def test_filter_with_constrains(constructor: Constructor) -> None: expected_expr = {"a": [1, 2], "b": [4, 6]} assert_equal_data(result_expr, expected_expr) + + +def test_filter_missing_column( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("pyspark", "duckdb", "sqlframe")): + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 2], "b": [3, 4]} + df = nw.from_native(constructor(data)) + + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) + if "polars" in str(constructor): + msg = r"^unable to find column \"c\"; valid columns: \[\"a\", \"b\"\]" + + if "polars_lazy" in str(constructor) and isinstance(df, nw.LazyFrame): + with pytest.raises(ColumnNotFoundError, match=msg): + df.filter(c=5).collect() + else: + with pytest.raises(ColumnNotFoundError, match=msg): + df.filter(c=5) diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 4fcbfb4015..67383ce37c 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -5,6 +5,7 @@ import pytest import narwhals as nw +from narwhals.exceptions import ColumnNotFoundError from narwhals.exceptions import ShapeError from tests.utils import PYARROW_VERSION from tests.utils import Constructor @@ -78,3 +79,27 @@ def test_with_columns_series_shape_mismatch(constructor_eager: ConstructorEager) ] with pytest.raises(ShapeError): df1.with_columns(second=second) + + +def test_with_columns_missing_column( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("pyspark", "duckdb", "sqlframe")): + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 2], "b": [3, 4]} + df = nw.from_native(constructor(data)) + + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) + if "polars" in str(constructor): + msg = r"^c\n\n" + + if "polars_lazy" in str(constructor) and isinstance(df, nw.LazyFrame): + # In the lazy case, Polars only errors when we call `collect` + with pytest.raises(ColumnNotFoundError, match=msg): + df.with_columns(d=nw.col("c") + 1).collect() + else: + with pytest.raises(ColumnNotFoundError, match=msg): + df.with_columns(d=nw.col("c") + 1) From bb7254f614b1f65801dafb46d6298b5b62f16486 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 08:07:28 +0200 Subject: [PATCH 13/18] move to drop test --- tests/frame/drop_test.py | 30 +++++++++++++++--------------- tests/frame/select_test.py | 8 +------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/tests/frame/drop_test.py b/tests/frame/drop_test.py index 91a360b0b0..02b88de3eb 100644 --- a/tests/frame/drop_test.py +++ b/tests/frame/drop_test.py @@ -30,22 +30,9 @@ def test_drop(constructor: Constructor, to_drop: list[str], expected: list[str]) assert df.drop(*to_drop).collect_schema().names() == expected -@pytest.mark.parametrize( - ("strict", "context"), - [ - ( - True, - pytest.raises(ColumnNotFoundError, match="z"), - ), - (False, does_not_raise()), - ], -) +@pytest.mark.parametrize("strict", [True, False]) def test_drop_strict( - request: pytest.FixtureRequest, - constructor: Constructor, - context: Any, - *, - strict: bool, + request: pytest.FixtureRequest, constructor: Constructor, *, strict: bool ) -> None: if "polars_lazy" in str(constructor) and POLARS_VERSION < (1, 0, 0) and strict: request.applymarker(pytest.mark.xfail) @@ -55,6 +42,19 @@ def test_drop_strict( df = nw.from_native(constructor(data)) + context: Any + if strict: + if "polars_lazy" in str(constructor): + msg = r"^\"z\"" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) + context = pytest.raises(ColumnNotFoundError, match=msg) + else: + context = does_not_raise() + with context: names_out = df.drop(to_drop, strict=strict).collect_schema().names() assert names_out == ["b"] diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 72bfcf9cc5..9819b34046 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -127,7 +127,7 @@ def maybe_collect(df: nw.LazyFrame[Any]) -> nw.DataFrame[Any] | nw.LazyFrame[Any return df if constructor_id == "polars[lazy]": - msg = r"^e|\"(e|f)\"" + msg = r"^e" elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" else: @@ -137,12 +137,6 @@ def maybe_collect(df: nw.LazyFrame[Any]) -> nw.DataFrame[Any] | nw.LazyFrame[Any ) with pytest.raises(ColumnNotFoundError, match=msg): maybe_collect(df.select(selected_columns)) - if constructor_id == "polars[lazy]" and POLARS_VERSION < (1,): # pragma: no cover - # Old Polars versions wouldn't raise an error at all here - pass - else: - with pytest.raises(ColumnNotFoundError, match=msg): - maybe_collect(df.drop(selected_columns, strict=True)) if "polars" in str(constructor_lazy): msg = r"^fdfa" with pytest.raises(ColumnNotFoundError, match=msg): From 823d356ffe4173ed3fd9e8ee9065d39b706b56df Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 08:10:45 +0200 Subject: [PATCH 14/18] fix msg regex --- tests/frame/filter_test.py | 9 +++++---- tests/frame/with_columns_test.py | 12 ++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index a4876a5f95..197f377a64 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -79,12 +79,13 @@ def test_filter_missing_column( data = {"a": [1, 2], "b": [3, 4]} df = nw.from_native(constructor(data)) - msg = ( - r"The following columns were not found: \[.*\]" - r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" - ) if "polars" in str(constructor): msg = r"^unable to find column \"c\"; valid columns: \[\"a\", \"b\"\]" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) if "polars_lazy" in str(constructor) and isinstance(df, nw.LazyFrame): with pytest.raises(ColumnNotFoundError, match=msg): diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 67383ce37c..6e701b14fa 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -89,13 +89,13 @@ def test_with_columns_missing_column( data = {"a": [1, 2], "b": [3, 4]} df = nw.from_native(constructor(data)) - msg = ( - r"The following columns were not found: \[.*\]" - r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" - ) if "polars" in str(constructor): - msg = r"^c\n\n" - + msg = r"^c" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) if "polars_lazy" in str(constructor) and isinstance(df, nw.LazyFrame): # In the lazy case, Polars only errors when we call `collect` with pytest.raises(ColumnNotFoundError, match=msg): From 5647f52d806dea5687745290f84092991f7a2a5d Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 08:45:43 +0200 Subject: [PATCH 15/18] update with columns --- narwhals/_duckdb/dataframe.py | 5 ++++- narwhals/_spark_like/dataframe.py | 7 +++++++ tests/frame/with_columns_test.py | 6 +++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 48338170b0..3e4eb0ad74 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -213,7 +213,10 @@ def with_columns(self, *exprs: DuckDBExpr) -> Self: for name in self.columns ] result.extend(value.alias(name) for name, value in new_columns_map.items()) - return self._with_native(self.native.select(*result)) + try: + return self._with_native(self.native.select(*result)) + except duckdb.BinderException as e: + raise ColumnNotFoundError.from_available_column_names(self.columns) from e def filter(self, predicate: DuckDBExpr) -> Self: # `[0]` is safe as the predicate's expression only returns a single column diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 6801cc0db8..ed0384dc63 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -286,6 +286,13 @@ def select( def with_columns(self, *exprs: SparkLikeExpr) -> Self: new_columns = evaluate_exprs(self, *exprs) + if self._implementation.is_pyspark(): # pragma: no cover + from pyspark.errors import AnalysisException + + try: + return self._with_native(self.native.withColumns(dict(new_columns))) + except AnalysisException as e: + raise ColumnNotFoundError.from_available_column_names(self.columns) from e return self._with_native(self.native.withColumns(dict(new_columns))) def filter(self, predicate: SparkLikeExpr) -> Self: diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 6e701b14fa..f2bf03e361 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -84,13 +84,17 @@ def test_with_columns_series_shape_mismatch(constructor_eager: ConstructorEager) def test_with_columns_missing_column( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor) for x in ("pyspark", "duckdb", "sqlframe")): + constructor_id = str(request.node.callspec.id) + if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): + # These backend raise errors at collect request.applymarker(pytest.mark.xfail) data = {"a": [1, 2], "b": [3, 4]} df = nw.from_native(constructor(data)) if "polars" in str(constructor): msg = r"^c" + elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" else: msg = ( r"The following columns were not found: \[.*\]" From 1cb3204e1b3bb09ce9879bdbfdacbaee35f1223e Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 08:51:59 +0200 Subject: [PATCH 16/18] update filter --- narwhals/_duckdb/dataframe.py | 5 ++++- narwhals/_spark_like/dataframe.py | 10 ++++++++-- tests/frame/filter_test.py | 5 ++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 3e4eb0ad74..38d160c930 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -221,7 +221,10 @@ def with_columns(self, *exprs: DuckDBExpr) -> Self: def filter(self, predicate: DuckDBExpr) -> Self: # `[0]` is safe as the predicate's expression only returns a single column mask = predicate(self)[0] - return self._with_native(self.native.filter(mask)) + try: + return self._with_native(self.native.filter(mask)) + except duckdb.BinderException as e: + raise ColumnNotFoundError.from_available_column_names(self.columns) from e @property def schema(self) -> dict[str, DType]: diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index ed0384dc63..3d041cbb33 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -298,8 +298,14 @@ def with_columns(self, *exprs: SparkLikeExpr) -> Self: def filter(self, predicate: SparkLikeExpr) -> Self: # `[0]` is safe as the predicate's expression only returns a single column condition = predicate._call(self)[0] - spark_df = self.native.where(condition) - return self._with_native(spark_df) + if self._implementation.is_pyspark(): + from pyspark.errors import AnalysisException + + try: + return self._with_native(self.native.where(condition)) + except AnalysisException as e: + raise ColumnNotFoundError.from_available_column_names(self.columns) from e + return self._with_native(self.native.where(condition)) @property def schema(self) -> dict[str, DType]: diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index 197f377a64..db64cb7ff4 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -74,13 +74,16 @@ def test_filter_with_constrains(constructor: Constructor) -> None: def test_filter_missing_column( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor) for x in ("pyspark", "duckdb", "sqlframe")): + constructor_id = str(request.node.callspec.id) + if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")): request.applymarker(pytest.mark.xfail) data = {"a": [1, 2], "b": [3, 4]} df = nw.from_native(constructor(data)) if "polars" in str(constructor): msg = r"^unable to find column \"c\"; valid columns: \[\"a\", \"b\"\]" + elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" else: msg = ( r"The following columns were not found: \[.*\]" From 901bdf7fcc2f1c3445c719544310626b900bd835 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 09:00:19 +0200 Subject: [PATCH 17/18] catch more precise error --- narwhals/_duckdb/dataframe.py | 12 +++++++++--- narwhals/_spark_like/dataframe.py | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 38d160c930..6f4ff045b3 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -187,7 +187,9 @@ def select( try: return self._with_native(self.native.select(*selection)) except duckdb.BinderException as e: - raise ColumnNotFoundError.from_available_column_names(self.columns) from e + if "this column cannot be referenced before it is defined" in str(e): + raise ColumnNotFoundError.from_available_column_names(self.columns) from e + raise def drop(self, columns: Sequence[str], *, strict: bool) -> Self: columns_to_drop = parse_columns_to_drop(self, columns=columns, strict=strict) @@ -216,7 +218,9 @@ def with_columns(self, *exprs: DuckDBExpr) -> Self: try: return self._with_native(self.native.select(*result)) except duckdb.BinderException as e: - raise ColumnNotFoundError.from_available_column_names(self.columns) from e + if "not found in FROM clause" in str(e): + raise ColumnNotFoundError.from_available_column_names(self.columns) from e + raise def filter(self, predicate: DuckDBExpr) -> Self: # `[0]` is safe as the predicate's expression only returns a single column @@ -224,7 +228,9 @@ def filter(self, predicate: DuckDBExpr) -> Self: try: return self._with_native(self.native.filter(mask)) except duckdb.BinderException as e: - raise ColumnNotFoundError.from_available_column_names(self.columns) from e + if "not found in FROM clause" in str(e): + raise ColumnNotFoundError.from_available_column_names(self.columns) from e + raise @property def schema(self) -> dict[str, DType]: diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 3d041cbb33..5418c3769e 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -281,7 +281,11 @@ def select( try: return self._with_native(self.native.select(*new_columns_list)) except AnalysisException as e: - raise ColumnNotFoundError.from_available_column_names(self.columns) from e + if str(e).startswith("[UNRESOLVED_COLUMN.WITH_SUGGESTION]"): + raise ColumnNotFoundError.from_available_column_names( + self.columns + ) from e + raise return self._with_native(self.native.select(*new_columns_list)) def with_columns(self, *exprs: SparkLikeExpr) -> Self: @@ -292,7 +296,11 @@ def with_columns(self, *exprs: SparkLikeExpr) -> Self: try: return self._with_native(self.native.withColumns(dict(new_columns))) except AnalysisException as e: - raise ColumnNotFoundError.from_available_column_names(self.columns) from e + if str(e).startswith("[UNRESOLVED_COLUMN.WITH_SUGGESTION]"): + raise ColumnNotFoundError.from_available_column_names( + self.columns + ) from e + raise return self._with_native(self.native.withColumns(dict(new_columns))) def filter(self, predicate: SparkLikeExpr) -> Self: @@ -304,7 +312,11 @@ def filter(self, predicate: SparkLikeExpr) -> Self: try: return self._with_native(self.native.where(condition)) except AnalysisException as e: - raise ColumnNotFoundError.from_available_column_names(self.columns) from e + if str(e).startswith("[UNRESOLVED_COLUMN.WITH_SUGGESTION]"): + raise ColumnNotFoundError.from_available_column_names( + self.columns + ) from e + raise return self._with_native(self.native.where(condition)) @property From 543252169e70b3a32c005a077f9ff61c09609baf Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 13 May 2025 09:07:54 +0200 Subject: [PATCH 18/18] remove redundant test --- tests/frame/select_test.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 9819b34046..338983f9c6 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -12,7 +12,6 @@ from narwhals.exceptions import NarwhalsError from tests.utils import DASK_VERSION from tests.utils import DUCKDB_VERSION -from tests.utils import POLARS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import ConstructorLazy @@ -94,12 +93,6 @@ def test_missing_columns_eager(constructor_eager: ConstructorEager) -> None: ) with pytest.raises(ColumnNotFoundError, match=msg): df.select(selected_columns) - if "polars" in str(constructor_eager) and POLARS_VERSION < (1,): # pragma: no cover - # Old Polars versions wouldn't raise an error at all here - pass - else: - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True) if "polars" in str(constructor_eager): msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" with pytest.raises(ColumnNotFoundError, match=msg):