Skip to content

enh: internals pluggable backends #2001

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
360 changes: 360 additions & 0 deletions narwhals/backends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,360 @@
from __future__ import annotations

import sys
from dataclasses import dataclass
from dataclasses import field
from dataclasses import replace
from importlib import import_module
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
from typing import Generator
from typing import TypeVar
from typing import cast

from narwhals.dataframe import DataFrame
from narwhals.dataframe import LazyFrame
from narwhals.series import Series
from narwhals.utils import Implementation
from narwhals.utils import Version
from narwhals.utils import parse_version

if TYPE_CHECKING:
from types import ModuleType

T = TypeVar("T")


BACKENDS = []


@dataclass
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally it would be nice to have it kw_only - although I know that's only supported from Python 3.10 for dataclasses 🥲

class Adaptation:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: some docstring 🙃

narwhals: type
native: str | type
adapter: str | type
level: str
kwargs: dict[str, Any] = field(default_factory=dict)
version: Version | None = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if we should allow version to be None. After all, we set that based on where from_native is called from 🤔


@property
def imported_adapter(self) -> type:
    """Return the adapter class, importing it first when given as a dotted path.

    When ``self.adapter`` is a string it is resolved with ``dynamic_import``;
    otherwise it is returned unchanged.
    """
    adapter = self.adapter
    return dynamic_import(adapter) if isinstance(adapter, str) else adapter

@property
def imported_native(self) -> type:
    """Return the native class, importing it first when given as a dotted path.

    When ``self.native`` is a string it is resolved with ``dynamic_import``;
    otherwise it is returned unchanged.
    """
    native = self.native
    return dynamic_import(native) if isinstance(native, str) else native


@dataclass
class Backend:
requires: list[tuple[str, str | Callable, tuple[int, ...]]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be worth considering a list[namedtuple] to improve readability

adaptations: list[Adaptation]
implementation: Implementation | None = None

def __post_init__(self) -> None:
    """Expand version-agnostic adaptations into one entry per ``Version``.

    An adaptation whose ``version`` is ``None`` is replaced by one copy per
    member of ``Version``; an adaptation that already carries a ``Version``
    member is kept as is. Relative order of the input list is preserved.

    Raises:
        TypeError: If an adaptation's ``version`` is neither ``None`` nor a
            ``Version`` member.
    """
    expanded: list[Adaptation] = []
    for adapt in self.adaptations:
        if adapt.version is None:
            # Checked first on purpose: on Python < 3.12 a containment test
            # of a non-member against an Enum class raises TypeError itself,
            # so `None in Version` would never reach this branch.
            expanded.extend(replace(adapt, version=v) for v in Version)
        elif isinstance(adapt.version, Version):
            expanded.append(adapt)
        else:
            msg = f"Adaptation.version must be {Version!r} or None, got {adapt.version!r}"
            raise TypeError(msg)
    self.adaptations = expanded

def get_adapter(
    self, cls: type, version: Version = Version.MAIN
) -> Adaptation | None:
    """Return the adaptation registered for native class ``cls``, if any.

    Only adaptations whose ``version`` equals ``version`` are considered.
    An adaptation matches when its ``native`` is ``cls`` itself, or when its
    ``native`` is a dotted path that resolves to ``cls``.

    Args:
        cls: The native class to look up.
        version: The ``Version`` the adaptation must be registered for.

    Returns:
        The first matching ``Adaptation``, or ``None`` when nothing matches.
    """
    module_name, *_ = cls.__module__.split(".", maxsplit=1)
    for adapt in self.adaptations:
        if adapt.version != version:
            continue

        # Narrow through a local instead of writing back to the adaptation.
        native = adapt.native
        if isinstance(native, type):
            if cls is native:
                return adapt
            continue
        if not isinstance(native, str):
            continue

        adapt_module_name, *_, adapt_cls_name = native.split(".")
        # Cheap string checks come first so that `dynamic_import` only runs
        # once the root module is already present in `sys.modules`.
        if (
            (adapt_module_name in sys.modules)  # base-module is imported
            and (module_name == adapt_module_name)  # roots match
            and (cls.__name__ == adapt_cls_name)  # tips match
            and (cls is dynamic_import(native))  # types are identical
        ):
            return adapt
    return None

def validate_backend_version(self) -> None:
    """Check every entry of ``self.requires`` against its minimum version.

    Each entry is a ``(module_name, version_getter, min_version)`` tuple.
    The version getter is either called (when callable) or resolved with
    ``dynamic_import`` (when a string); the result is passed through
    ``parse_version`` and compared with ``min_version``.

    Raises:
        TypeError: If a version getter is neither a string nor callable.
        ValueError: If an installed version is lower than ``min_version``.
    """
    for module_name, version_getter, min_version in self.requires:
        # TODO(camriddell): this logic may be better suited for a Version namedtuple or dataclass
        if callable(version_getter):
            version_str = version_getter()
        elif isinstance(version_getter, str):
            version_str = dynamic_import(version_getter)
        else:
            msg = f"version_getter {version_getter!r} must be a string or callable, got {type(version_getter)}"
            raise TypeError(msg)

        installed_version = parse_version(version_str)
        if installed_version < min_version:
            msg = f"{module_name} must be updated to at least {min_version}, got {installed_version}"
            raise ValueError(msg)

def version(self) -> tuple[int, ...]:
    """Return the parsed version reported by the first ``requires`` entry.

    The version getter of ``self.requires[0]`` is either called (when
    callable) or resolved with ``dynamic_import`` (when a string), and the
    result is passed through ``parse_version``.

    Raises:
        TypeError: If the version getter is neither a string nor callable.
    """
    version_getter = self.requires[0][1]
    # TODO(camriddell): this logic may be better suited for a Version namedtuple or dataclass
    if callable(version_getter):
        version_str = version_getter()
    elif isinstance(version_getter, str):
        version_str = dynamic_import(version_getter)
    else:
        msg = f"version_getter {version_getter!r} must be a string or callable, got {type(version_getter)}"
        raise TypeError(msg)
    return parse_version(version_str)

def native_namespace(self) -> ModuleType:
    """Import and return the module named by the first ``requires`` entry."""
    primary_requirement = self.requires[0]
    module_name = primary_requirement[0]
    return import_module(module_name)

def get_native_namespace(self) -> ModuleType | None:
    """Return the first required module if it is already imported, else ``None``.

    Unlike ``native_namespace`` this only consults ``sys.modules`` and never
    triggers an import.
    """
    module_name = self.requires[0][0]
    return sys.modules.get(module_name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

General comment, for which I didn't really think of an alternative, is that I don't like too much the fact that self.requires is used implicitly 3 times with the first item in the list.

One idea could be to have a distinction between (let's maybe get better names):

  • backend_library -> this is where the adaptation.native class comes from
    • In the proposed implementation this is dynamically extracted from adaptation.native, but then for native_namespace the first tuple in requires is used
  • extra_requires -> potential list of extra requirements



def register_backends(*backends: Backend) -> None:
    """Append the given backends, in order, to the module-level ``BACKENDS`` list."""
    BACKENDS.extend(backends)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick:

Suggested change
for b in backends:
BACKENDS.append(b) # noqa: PERF402
BACKENDS.extend(backends)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤦 not a nitpick, you can even see that I ignored ruff while working quickly to get this up.

On this topic, do you have any thoughts about BACKENDS being a mutable global entity? I am stuck on this consideration vs making it immutable and allowing users to pass extra backends through the from_native function.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On this topic, do you have any thoughts about BACKENDS being a mutable global entity?

I feel uncomfortable as well with having such global variable.
I am not very familiar with the pattern - but what about something like a BackendRegistry class in which it's possible to set/register backends but not delete them (explicitly)? I am thinking out loud here



def traverse_rsplits(
    text: str, sep: str = " "
) -> Generator[tuple[str, list[str]], None, None]:
    """Yield ``(base, remaining)`` pairs for progressively deeper right-splits.

    For ``text`` containing ``k`` occurrences of ``sep``, the i-th pair
    (``i`` from 1 to ``k``) is ``text.rsplit(sep, maxsplit=i)`` separated into
    its first element and the list of the rest, so ``base`` gets shorter on
    every step. When ``sep`` does not occur at all, a single ``(text, [])``
    pair is yielded.

    >>> list(traverse_rsplits("a.b.c", sep="."))
    [('a.b', ['c']), ('a', ['b', 'c'])]
    >>> list(traverse_rsplits("abc", sep="."))
    [('abc', [])]
    """
    sep_count = text.count(sep)
    if sep_count == 0:
        yield text, []
        return

    for i in range(1, sep_count + 1):
        base, *remaining = text.rsplit(sep, maxsplit=i)
        yield base, remaining


def dynamic_import(dotted_path: str) -> Any:
    """Resolve a dotted path such as ``"pkg.mod.Name"`` to the object it names.

    Candidate module prefixes produced by ``traverse_rsplits`` are tried from
    longest to shortest. The first prefix that imports successfully is used,
    and the remaining path components are looked up on it with ``getattr``.

    Args:
        dotted_path: Dotted path of a module followed by one or more
            attribute names.

    Returns:
        The object found at ``dotted_path``.

    Raises:
        ImportError: If no prefix of ``dotted_path`` can be imported, or the
            path contains no attribute part.
        AttributeError: If a prefix imports but an attribute lookup fails.
    """
    for base, attributes in traverse_rsplits(dotted_path, sep="."):
        if not attributes:
            # A bare name has nothing to look up on a module.
            continue
        try:
            module = import_module(base)
        except ImportError:
            # `base` is not importable as a module; retry with a shorter prefix.
            continue
        obj = module
        for attr in attributes:
            obj = getattr(obj, attr)
        return obj
    msg = f"Could not import {dotted_path!r}"
    raise ImportError(msg)


# Built-in backend registrations. Each `Backend` lists its requirements as
# (module name, dotted path of the version attribute, minimum version) and the
# adaptations mapping a native class to the narwhals class and its adapter.
# Native and adapter classes are given as dotted paths so that no backend is
# imported at registration time.
register_backends(
    Backend(
        requires=[
            ("pandas", "pandas.__version__", (0, 25, 3)),
        ],
        adaptations=[
            Adaptation(
                DataFrame,
                "pandas.DataFrame",
                "narwhals._pandas_like.dataframe.PandasLikeDataFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
            Adaptation(
                Series,
                "pandas.Series",
                "narwhals._pandas_like.series.PandasLikeSeries",
                level="full",
            ),
        ],
        implementation=Implementation.PANDAS,
    ),
    Backend(
        requires=[
            ("polars", "polars.__version__", (0, 20, 3)),
        ],
        adaptations=[
            Adaptation(
                LazyFrame,
                "polars.LazyFrame",
                "narwhals._polars.dataframe.PolarsLazyFrame",
                level="full",
            ),
            Adaptation(
                DataFrame,
                "polars.DataFrame",
                "narwhals._polars.dataframe.PolarsDataFrame",
                level="full",
            ),
            Adaptation(
                Series,
                "polars.Series",
                "narwhals._polars.series.PolarsSeries",
                level="full",
            ),
        ],
    ),
    Backend(
        requires=[("modin.pandas", "modin.__version__", (0, 25, 3))],
        adaptations=[
            Adaptation(
                DataFrame,
                "modin.pandas.DataFrame",
                "narwhals._pandas_like.dataframe.PandasLikeDataFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
            Adaptation(
                Series,
                "modin.pandas.Series",
                "narwhals._pandas_like.series.PandasLikeSeries",
                level="full",
            ),
        ],
        implementation=Implementation.MODIN,
    ),
    Backend(
        requires=[
            ("cudf", "cudf.__version__", (24, 10)),
        ],
        adaptations=[
            Adaptation(
                DataFrame,
                "cudf.DataFrame",
                "narwhals._pandas_like.dataframe.PandasLikeDataFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
            Adaptation(
                Series,
                "cudf.Series",
                "narwhals._pandas_like.series.PandasLikeSeries",
                level="full",
            ),
        ],
        implementation=Implementation.CUDF,
    ),
    Backend(
        requires=[
            ("pyarrow", "pyarrow.__version__", (11,)),
        ],
        adaptations=[
            Adaptation(
                DataFrame,
                "pyarrow.Table",
                "narwhals._arrow.dataframe.ArrowDataFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
            Adaptation(
                Series,
                "pyarrow.ChunkedArray",
                "narwhals._arrow.series.ArrowSeries",
                level="full",
                kwargs={"name": ""},
            ),
        ],
    ),
    Backend(
        requires=[("pyspark.sql", "pyspark.__version__", (3, 5))],
        adaptations=[
            Adaptation(
                LazyFrame,
                "pyspark.sql.DataFrame",
                "narwhals._spark.dataframe.SparkLikeLazyFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
            # NOTE(review): `pyspark.sql` does not appear to expose a `Series`
            # class - confirm whether this adaptation is intended at all.
            Adaptation(
                Series,
                "pyspark.sql.Series",
                "narwhals._arrow.series.ArrowSeries",
                level="full",
            ),
        ],
        implementation=Implementation.PYSPARK,
    ),
    Backend(
        requires=[
            ("dask.dataframe", "dask.__version__", (2024, 8)),
            ("dask_expr", "dask_expr.__version__", (0,)),
        ],
        adaptations=[
            Adaptation(
                LazyFrame,
                "dask.dataframe.DataFrame",
                "narwhals._dask.dataframe.DaskLazyFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
            Adaptation(
                LazyFrame,
                "dask_expr.DataFrame",
                "narwhals._dask.dataframe.DaskLazyFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
        ],
    ),
    Backend(
        requires=[("duckdb", "duckdb.__version__", (1,))],
        adaptations=[
            Adaptation(
                LazyFrame,
                "duckdb.DuckDBPyRelation",
                "narwhals._duckdb.dataframe.DuckDBLazyFrame",
                level="full",
                kwargs={"validate_column_names": True},
                version=Version.MAIN,
            ),
            Adaptation(
                DataFrame,
                "duckdb.DuckDBPyRelation",
                "narwhals._duckdb.dataframe.DuckDBLazyFrame",
                level="interchange",
                version=Version.V1,
                kwargs={"validate_column_names": True},
            ),
        ],
    ),
    Backend(
        requires=[
            ("ibis", "ibis.__version__", (6,)),
        ],
        adaptations=[
            Adaptation(
                LazyFrame,
                "ibis.expr.types.Table",
                "narwhals._ibis.dataframe.IbisLazyFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
        ],
    ),
    Backend(
        requires=[
            ("sqlframe", "sqlframe._version.__version__", (3, 14, 2)),
        ],
        adaptations=[
            Adaptation(
                LazyFrame,
                "sqlframe.base.dataframe.BaseDataFrame",
                "narwhals._spark.dataframe.SparkLikeLazyFrame",
                level="full",
                kwargs={"validate_column_names": True},
            ),
        ],
        implementation=Implementation.SQLFRAME,
    ),
)
3 changes: 2 additions & 1 deletion narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from narwhals.exceptions import LengthChangingExprError
from narwhals.exceptions import OrderDependentExprError
from narwhals.schema import Schema
from narwhals.translate import to_native
from narwhals.utils import Implementation
from narwhals.utils import find_stacklevel
from narwhals.utils import flatten
Expand Down Expand Up @@ -2364,6 +2363,8 @@ def to_native(self: Self) -> FrameT:
└───────┴───────┘
<BLANKLINE>
"""
from narwhals.translate import to_native

return to_native(narwhals_object=self, pass_through=False)
Comment on lines +2344 to 2346
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can also just be:

Suggested change
from narwhals.translate import to_native
return to_native(narwhals_object=self, pass_through=False)
return self._compliant_frame._native_frame


# inherited
Expand Down
Loading
Loading