fix: unify ColumnNotFound for duckdb and pyspark #2493

Open Β· wants to merge 17 commits into main
5 changes: 4 additions & 1 deletion narwhals/_duckdb/dataframe.py
@@ -184,7 +184,10 @@ def select(
         *exprs: DuckDBExpr,
     ) -> Self:
         selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
-        return self._with_native(self.native.select(*selection))
+        try:
+            return self._with_native(self.native.select(*selection))
+        except duckdb.BinderException as e:
+            raise ColumnNotFoundError.from_available_column_names(self.columns) from e
Comment on lines +187 to +190
Member

do we risk catching other errors with this? BinderException might be a bit broad, shall we also match on str(e) before raising ColumnNotFoundError?

we should probably also do this for:

  • with_columns
  • simple_select
  • filter

?
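
For illustration, a minimal sketch of that stricter match. The "not found" substring checked below is an assumption about duckdb's binder message wording, not something this PR verifies; the module-level imports (import duckdb, from narwhals.exceptions import ColumnNotFoundError) are already present in dataframe.py:

    def select(self, *exprs: DuckDBExpr) -> Self:
        selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs))
        try:
            return self._with_native(self.native.select(*selection))
        except duckdb.BinderException as e:
            # Only translate binder errors that look like a missing column;
            # anything else (e.g. a type-binding failure) propagates unchanged.
            if "not found" in str(e):
                raise ColumnNotFoundError.from_available_column_names(self.columns) from e
            raise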

Member

@MarcoGorelli this has reminded me about an issue with SparkLikeLazyFrame

def _to_arrow_schema(self) -> pa.Schema:  # pragma: no cover
    import pyarrow as pa  # ignore-banned-import
    from narwhals._arrow.utils import narwhals_to_native_dtype

    schema: list[tuple[str, pa.DataType]] = []
    nw_schema = self.collect_schema()
    native_schema = self.native.schema
    for key, value in nw_schema.items():
        try:
            native_dtype = narwhals_to_native_dtype(value, self._version)
        except Exception as exc:  # noqa: BLE001,PERF203
            native_spark_dtype = native_schema[key].dataType  # type: ignore[index]

This one is a bigger problem because it captures CTRL+C, so you can't easily stop the test suite while it's running
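
One way to keep a handler like this interruptible (a defensive sketch, not what this PR changes) is to re-raise interrupt-style exceptions before the broad fallback:

for key, value in nw_schema.items():
    try:
        native_dtype = narwhals_to_native_dtype(value, self._version)
    except (KeyboardInterrupt, SystemExit):
        # Let CTRL+C and interpreter shutdown propagate immediately.
        raise
    except Exception:  # noqa: BLE001
        # Fall back to the native Spark dtype for this one column.
        native_spark_dtype = native_schema[key].dataType  # type: ignore[index]

Note that KeyboardInterrupt derives from BaseException, so a plain except Exception should already let it through; if CTRL+C is still being swallowed, the backend is likely wrapping it in an Exception subclass before it reaches this handler.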

Collaborator Author

Good point @MarcoGorelli!

Regarding simple_select, we are already catching a blind Exception:

if flat_exprs and all(isinstance(x, str) for x in flat_exprs) and not named_exprs:
    # fast path!
    try:
        return self._with_compliant(
            self._compliant_frame.simple_select(*flat_exprs)
        )
    except Exception as e:
        # Column not found is the only thing that can realistically be raised here.
        available_columns = self.columns
        missing_columns = [x for x in flat_exprs if x not in available_columns]
        raise ColumnNotFoundError.from_missing_and_available_column_names(
            missing_columns, available_columns
        ) from e

And it should already be caught by the first test:

if isinstance(df, nw.LazyFrame):
    with pytest.raises(ColumnNotFoundError, match=msg):
        df.select(selected_columns).collect()
else:
    with pytest.raises(ColumnNotFoundError, match=msg):
        df.select(selected_columns)

Maybe we should have a _missing_column_exception property in BaseFrame that is specified for each backend, so we can catch the exact exception here.
What do you think? I could do that in a separate PR to see how it looks.
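
For illustration, a rough sketch of how that could look (the class names and override points below are hypothetical, not part of this PR):

class BaseFrame:
    @property
    def _missing_column_exception(self) -> type[Exception]:
        # Default: backends that already raise our own error.
        from narwhals.exceptions import ColumnNotFoundError

        return ColumnNotFoundError


class DuckDBLazyFrame(BaseFrame):
    @property
    def _missing_column_exception(self) -> type[Exception]:
        import duckdb

        return duckdb.BinderException


class SparkLikeLazyFrame(BaseFrame):
    @property
    def _missing_column_exception(self) -> type[Exception]:
        from pyspark.errors import AnalysisException

        return AnalysisException

simple_select could then catch self._compliant_frame._missing_column_exception instead of a blind Exception.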

Regarding filter and with_columns, I think we are not yet testing what happens when non-existent columns are used in these methods, or am I missing something? I can make another small PR for just this.


     def drop(self, columns: Sequence[str], *, strict: bool) -> Self:
         columns_to_drop = parse_columns_to_drop(self, columns=columns, strict=strict)
8 changes: 8 additions & 0 deletions narwhals/_spark_like/dataframe.py
@@ -15,6 +15,7 @@
 from narwhals._spark_like.utils import import_native_dtypes
 from narwhals._spark_like.utils import import_window
 from narwhals._spark_like.utils import native_to_narwhals_dtype
+from narwhals.exceptions import ColumnNotFoundError
 from narwhals.exceptions import InvalidOperationError
 from narwhals.typing import CompliantLazyFrame
 from narwhals.utils import Implementation
@@ -274,6 +275,13 @@ def select(
     ) -> Self:
         new_columns = evaluate_exprs(self, *exprs)
         new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns]
+        if self._implementation.is_pyspark():  # pragma: no cover
+            from pyspark.errors import AnalysisException
+
+            try:
+                return self._with_native(self.native.select(*new_columns_list))
+            except AnalysisException as e:
+                raise ColumnNotFoundError.from_available_column_names(self.columns) from e
         return self._with_native(self.native.select(*new_columns_list))

     def with_columns(self, *exprs: SparkLikeExpr) -> Self:
10 changes: 10 additions & 0 deletions narwhals/exceptions.py
@@ -38,6 +38,16 @@ def from_missing_and_available_column_names(
         )
         return ColumnNotFoundError(message)

+    @classmethod
+    def from_available_column_names(
+        cls: type, available_columns: list[str]
+    ) -> ColumnNotFoundError:
+        message = (
+            "The selected columns were not found."
+            f"\n\nHint: Did you mean one of these columns: {available_columns}?"
+        )
+        return ColumnNotFoundError(message)
+

 class ComputeError(NarwhalsError):
     """Exception raised when the underlying computation could not be evaluated."""
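
For reference, a quick sketch of the message the new classmethod produces (output reconstructed from the template above):

from narwhals.exceptions import ColumnNotFoundError

err = ColumnNotFoundError.from_available_column_names(["a", "b", "z"])
print(err)
# The selected columns were not found.
#
# Hint: Did you mean one of these columns: ['a', 'b', 'z']?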
13 changes: 12 additions & 1 deletion tests/conftest.py
@@ -27,6 +27,7 @@
 from narwhals.typing import NativeLazyFrame
 from tests.utils import Constructor
 from tests.utils import ConstructorEager
+from tests.utils import ConstructorLazy

 Data: TypeAlias = "dict[str, list[Any]]"

@@ -227,7 +228,7 @@ def sqlframe_pyspark_lazy_constructor(obj: Data) -> SQLFrameDataFrame:  # pragma
"cudf": cudf_constructor,
"polars[eager]": polars_eager_constructor,
}
LAZY_CONSTRUCTORS: dict[str, Constructor] = {
LAZY_CONSTRUCTORS: dict[str, ConstructorLazy] = {
"dask": dask_lazy_p2_constructor,
"polars[lazy]": polars_lazy_constructor,
"duckdb": duckdb_lazy_constructor,
@@ -259,6 +260,8 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:

     eager_constructors: list[ConstructorEager] = []
     eager_constructors_ids: list[str] = []
+    lazy_constructors: list[ConstructorLazy] = []
+    lazy_constructors_ids: list[str] = []
     constructors: list[Constructor] = []
     constructors_ids: list[str] = []

@@ -274,8 +277,12 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             eager_constructors_ids.append(constructor)
             constructors.append(EAGER_CONSTRUCTORS[constructor])
         elif constructor in {"pyspark", "pyspark[connect]"}:  # pragma: no cover
+            lazy_constructors.append(pyspark_lazy_constructor())
+            lazy_constructors_ids.append(constructor)
             constructors.append(pyspark_lazy_constructor())
         elif constructor in LAZY_CONSTRUCTORS:
+            lazy_constructors.append(LAZY_CONSTRUCTORS[constructor])
+            lazy_constructors_ids.append(constructor)
             constructors.append(LAZY_CONSTRUCTORS[constructor])
         else:  # pragma: no cover
             msg = f"Expected one of {EAGER_CONSTRUCTORS.keys()} or {LAZY_CONSTRUCTORS.keys()}, got {constructor}"
@@ -286,5 +293,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
         metafunc.parametrize(
             "constructor_eager", eager_constructors, ids=eager_constructors_ids
         )
+    elif "constructor_lazy" in metafunc.fixturenames:
+        metafunc.parametrize(
+            "constructor_lazy", lazy_constructors, ids=lazy_constructors_ids
+        )
     elif "constructor" in metafunc.fixturenames:
         metafunc.parametrize("constructor", constructors, ids=constructors_ids)
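
With that wiring in place, a test can request the new fixture directly; a hypothetical example (the test itself is not part of this PR):

import narwhals as nw

from tests.utils import ConstructorLazy


def test_some_lazy_behaviour(constructor_lazy: ConstructorLazy) -> None:
    # pytest_generate_tests parametrizes `constructor_lazy` over every
    # selected lazy backend (dask, polars[lazy], duckdb, pyspark, ...).
    df = nw.from_native(constructor_lazy({"a": [1, 2, 3]}))
    assert isinstance(df, nw.LazyFrame)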
86 changes: 54 additions & 32 deletions tests/frame/select_test.py
Collaborator Author

I split the test into lazy and eager variants to simplify the if-else statements a bit. I hope it is a bit more readable?

@@ -15,6 +15,7 @@
 from tests.utils import POLARS_VERSION
 from tests.utils import Constructor
 from tests.utils import ConstructorEager
+from tests.utils import ConstructorLazy
 from tests.utils import assert_equal_data


@@ -83,48 +84,69 @@ def test_comparison_with_list_error_message() -> None:
     nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3]  # noqa: B015


-def test_missing_columns(
-    constructor: Constructor, request: pytest.FixtureRequest
-) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_missing_columns_eager(constructor_eager: ConstructorEager) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
-    df = nw.from_native(constructor(data))
+    df = nw.from_native(constructor_eager(data))
     selected_columns = ["a", "e", "f"]
     msg = (
         r"The following columns were not found: \[.*\]"
         r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
     )
-    if "polars" in str(constructor):
-        # In the lazy case, Polars only errors when we call `collect`,
-        # and we have no way to recover exactly which columns the user
-        # tried selecting. So, we just emit their message (which varies
-        # across versions...)
-        msg = "e|f"
-    if isinstance(df, nw.LazyFrame):
-        with pytest.raises(ColumnNotFoundError, match=msg):
-            df.select(selected_columns).collect()
-    else:
-        with pytest.raises(ColumnNotFoundError, match=msg):
-            df.select(selected_columns)
-    if POLARS_VERSION >= (1,):
-        # Old Polars versions wouldn't raise an error
-        # at all here
-        if isinstance(df, nw.LazyFrame):
-            with pytest.raises(ColumnNotFoundError, match=msg):
-                df.drop(selected_columns, strict=True).collect()
-        else:
-            with pytest.raises(ColumnNotFoundError, match=msg):
-                df.drop(selected_columns, strict=True)
-    else:  # pragma: no cover
-        pass
+    with pytest.raises(ColumnNotFoundError, match=msg):
+        df.select(selected_columns)
+    if "polars" in str(constructor_eager) and POLARS_VERSION < (1,):  # pragma: no cover
+        # Old Polars versions wouldn't raise an error at all here
+        pass
+    else:
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            df.select(selected_columns)
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            df.drop(selected_columns, strict=True)
+    if "polars" in str(constructor_eager):
+        msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            df.select(nw.col("fdfa"))
Comment on lines +105 to +106
Collaborator Author

Before, this was not tested for polars.



+def test_missing_columns_lazy(
+    constructor_lazy: ConstructorLazy, request: pytest.FixtureRequest
+) -> None:
+    constructor_id = str(request.node.callspec.id)
+    if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]")):
Collaborator Author (@EdAbati, May 9, 2025)

sqlframe and pyspark.connect raise errors at collect. πŸ˜•

I need to double-check pyspark.connect. Currently I cannot set it up locally... Working on it ⏳

Do you have an idea on how to deal with these?

+        # These backends raise errors at collect
+        request.applymarker(pytest.mark.xfail)
+    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
+    df = nw.from_native(constructor_lazy(data))
+    selected_columns = ["a", "e", "f"]
+
+    def maybe_collect(df: nw.LazyFrame[Any]) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
+        if constructor_id == "polars[lazy]":
+            # In the lazy case, Polars only errors when we call `collect`,
+            # and we have no way to recover exactly which columns the user
+            # tried selecting. So, we just emit their message (which varies
+            # across versions...)
+            return df.collect()
+        return df
+
+    if constructor_id == "polars[lazy]":
+        msg = r"^e|\"(e|f)\""
Collaborator Author (@EdAbati, May 9, 2025)

Before, it was msg = "e|f". Now it is a bit stricter.

+    elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")):
+        msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
+    else:
+        msg = (
+            r"The following columns were not found: \[.*\]"
+            r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?"
+        )
+    with pytest.raises(ColumnNotFoundError, match=msg):
+        maybe_collect(df.select(selected_columns))
+    if constructor_id == "polars[lazy]" and POLARS_VERSION < (1,):  # pragma: no cover
+        # Old Polars versions wouldn't raise an error at all here
+        pass
+    else:
+        with pytest.raises(ColumnNotFoundError, match=msg):
-            df.select(nw.col("fdfa"))
+            maybe_collect(df.drop(selected_columns, strict=True))
+    if "polars" in str(constructor_lazy):
+        msg = r"^fdfa"
+        with pytest.raises(ColumnNotFoundError, match=msg):
+            maybe_collect(df.select(nw.col("fdfa")))


 def test_left_to_right_broadcasting(constructor: Constructor) -> None:
1 change: 1 addition & 0 deletions tests/utils.py
@@ -46,6 +46,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]:

 Constructor: TypeAlias = Callable[[Any], "NativeLazyFrame | NativeFrame | DataFrameLike"]
 ConstructorEager: TypeAlias = Callable[[Any], "NativeFrame | DataFrameLike"]
+ConstructorLazy: TypeAlias = Callable[[Any], "NativeLazyFrame"]


 def zip_strict(left: Sequence[Any], right: Sequence[Any]) -> Iterator[Any]: