
Commit 8552610

Add narwhals materializer for dataframe agnosticism. (#189)
Co-authored-by: Marco Edward Gorelli <[email protected]>
1 parent 80c3bc1 commit 8552610
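
For context, a minimal usage sketch of what this change enables. This is not part of the commit; it assumes polars is installed and uses Formulaic's existing `model_matrix` entry point, leaving the output type to the materializer's default:

    import polars as pl
    from formulaic import model_matrix

    # Any eager frame that narwhals recognises (e.g. polars, pyarrow) can now be
    # materialized without a dedicated, input-specific materializer.
    df = pl.DataFrame({"x": [1.0, 2.0, 3.0], "g": ["a", "b", "a"]})
    mm = model_matrix("x + g", df)  # resolved via NarwhalsMaterializer.SUPPORTS_INPUT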

11 files changed (+484 -123 lines)


formulaic/materializers/__init__.py (+2 -2)

@@ -1,11 +1,11 @@
-from .arrow import ArrowMaterializer
 from .base import FormulaMaterializer
+from .narwhals import NarwhalsMaterializer
 from .pandas import PandasMaterializer
 from .types import ClusterBy, FactorValues, NAAction

 __all__ = [
-    "ArrowMaterializer",
     "FormulaMaterializer",
+    "NarwhalsMaterializer",
     "PandasMaterializer",
     # Useful types
     "ClusterBy",

formulaic/materializers/arrow.py (-50)

This file was deleted.

formulaic/materializers/base.py (+61 -9)

@@ -33,6 +33,7 @@
 from formulaic.transforms import TRANSFORMS
 from formulaic.utils.cast import as_columns
 from formulaic.utils.layered_mapping import LayeredMapping
+from formulaic.utils.null_handling import find_nulls
 from formulaic.utils.stateful_transforms import stateful_eval
 from formulaic.utils.variables import Variable

@@ -85,26 +86,44 @@ def for_data(cls, data: Any, output: Hashable = None) -> type[FormulaMaterializer]:
         datacls = data.__class__
         input_type = f"{datacls.__module__}.{datacls.__qualname__}"

-        if input_type not in cls.REGISTERED_INPUTS:
+        materializers_supporting_input = []
+
+        if input_type in cls.REGISTERED_INPUTS:
+            materializers_supporting_input.extend(cls.REGISTERED_INPUTS[input_type])
+
+        if output is None and materializers_supporting_input:
+            return materializers_supporting_input[0]
+
+        for materializer in sorted(
+            set(cls.REGISTERED_NAMES.values()),
+            key=lambda x: x.REGISTER_PRECEDENCE,
+            reverse=True,
+        ):
+            if materializer.SUPPORTS_INPUT(data):
+                materializers_supporting_input.append(materializer)
+
+        if not materializers_supporting_input:
             raise FormulaMaterializerNotFoundError(
-                f"No materializer has been registered for input type {repr(input_type)}. Available input types are: {set(cls.REGISTER_INPUTS)}."
+                f"No materializer is available for input type {repr(input_type)}. Explicitly registered input types are: {tuple(sorted(cls.REGISTERED_INPUTS))}."
             )

         if output is None:
-            return cls.REGISTERED_INPUTS[input_type][0]
+            return materializers_supporting_input[0]

-        for materializer in cls.REGISTERED_INPUTS[input_type]:
+        for materializer in materializers_supporting_input:
             if output in materializer.REGISTER_OUTPUTS:
                 return materializer

         output_types: set[Hashable] = set(
-            *itertools.chain(
-                materializer.REGISTER_OUTPUTS
-                for materializer in cls.REGISTERED_INPUTS[input_type]
+            itertools.chain(
+                *[
+                    materializer.REGISTER_OUTPUTS
+                    for materializer in materializers_supporting_input
+                ]
             )
         )
         raise FormulaMaterializerNotFoundError(
-            f"No materializer has been registered for input type {repr(input_type)} that supports output type {repr(output)}. Available output types for {repr(input_type)} are: {output_types}."
+            f"No materializer is available for input type {repr(input_type)} that also supports output type {repr(output)}. Available output types for {repr(input_type)} are: {tuple(sorted(output_types, key=lambda x: str(x)))}."
         )

@@ -114,6 +133,19 @@ class FormulaMaterializer(metaclass=FormulaMaterializerMeta):
     REGISTER_OUTPUTS: Sequence[Hashable] = ()
     REGISTER_PRECEDENCE: float = 100

+    @classmethod
+    def SUPPORTS_INPUT(cls, data: Any) -> bool:
+        """
+        Check whether this materializer supports the given data.
+        This allows for non-explicit input registration where additional
+        dynamism is required, or where this materializer should act as a
+        fallback.
+
+        Note: materializers with explicitly registered inputs will always take
+        priority.
+        """
+        return False
+
     # Public API

     @inherit_docs(method="_init")
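
To illustrate the new hook: a downstream materializer could opt into the dynamic fallback by overriding SUPPORTS_INPUT instead of listing REGISTER_INPUTS. The class and detection logic below are invented for illustration; only the hook itself and the precedence/registration attributes come from this codebase:

    class MyFrameMaterializer(FormulaMaterializer):
        REGISTER_NAME = "my_frame"  # hypothetical
        REGISTER_PRECEDENCE = 50  # lower precedence: consulted after other fallbacks

        @classmethod
        def SUPPORTS_INPUT(cls, data: Any) -> bool:
            # for_data() consults this only after any explicitly registered
            # materializers for the input type, so explicit registrations
            # always take priority (as noted in the docstring above).
            return type(data).__name__ == "MyFrame"  # hypothetical duck check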
@@ -619,7 +651,27 @@ def _is_categorical(self, values: Any) -> bool:
     def _check_for_nulls(
         self, name: str, values: Any, na_action: NAAction, drop_rows: set[int]
     ) -> None:
-        pass  # pragma: no cover
+        if na_action is NAAction.IGNORE:
+            return
+
+        try:
+            null_indices = find_nulls(values)
+
+            if na_action is NAAction.RAISE:
+                if null_indices:
+                    raise ValueError(f"`{name}` contains null values after evaluation.")
+
+            elif na_action is NAAction.DROP:
+                drop_rows.update(null_indices)
+
+            else:
+                raise ValueError(
+                    f"Do not know how to interpret `na_action` = {repr(na_action)}."
+                )  # pragma: no cover; this is currently impossible to reach
+        except ValueError as e:
+            raise ValueError(
+                f"Error encountered while checking for nulls in `{name}`: {e}"
+            ) from e

     def _encode_evaled_factor(
         self,
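
A rough sketch of the user-facing behaviour this implements. NAAction is importable from formulaic.materializers (it is imported in the __init__.py above); passing na_action through model_matrix as a spec override is an assumption here:

    import pandas
    from formulaic import model_matrix
    from formulaic.materializers import NAAction

    df = pandas.DataFrame({"x": [1.0, None, 3.0]})

    model_matrix("x", df, na_action=NAAction.DROP)    # row with the null is dropped
    model_matrix("x", df, na_action=NAAction.IGNORE)  # nulls pass through untouched
    model_matrix("x", df, na_action=NAAction.RAISE)   # ValueError about nulls in `x`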

formulaic/materializers/narwhals.py (+207)

@@ -0,0 +1,207 @@
+# pragma: no cover; TODO: experimental
+
+from __future__ import annotations
+
+import functools
+import itertools
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any
+
+import narwhals.stable.v1 as nw
+import numpy
+import pandas
+import scipy.sparse as spsparse
+from interface_meta import override
+
+from formulaic.utils.cast import as_columns
+from formulaic.utils.null_handling import drop_rows as drop_nulls
+
+from .base import FormulaMaterializer
+
+if TYPE_CHECKING:  # pragma: no cover
+    from formulaic.model_spec import ModelSpec
+
+
+class NarwhalsMaterializer(FormulaMaterializer):
+    REGISTER_NAME = "narwhals"
+    REGISTER_INPUTS: Sequence[str] = (
+        "narwhals.DataFrame",
+        "narwhals.stable.v1.DataFrame",
+    )
+    REGISTER_OUTPUTS: Sequence[str] = ("narwhals", "pandas", "numpy", "sparse")
+
+    @override
+    @classmethod
+    def SUPPORTS_INPUT(cls, data: Any) -> bool:
+        return nw.dependencies.is_into_dataframe(data)
+
+    @override
+    def _init(self) -> None:
+        self.__narwhals_data = nw.from_native(self.data, eager_only=True)
+        self.__data_context = self.__narwhals_data.to_dict()
+
+    @override  # type: ignore
+    @property
+    def data_context(self):
+        return self.__data_context
+
+    @override
+    def _is_categorical(self, values: Any) -> bool:
+        if nw.dependencies.is_narwhals_series(values):
+            if not values.dtype.is_numeric():
+                return True
+        return super()._is_categorical(values)
+
+    @override
+    def _encode_constant(
+        self,
+        value: Any,
+        metadata: Any,
+        encoder_state: dict[str, Any],
+        spec: ModelSpec,
+        drop_rows: Sequence[int],
+    ) -> Any:
+        nrows = self.nrows - len(drop_rows)
+        if spec.output == "sparse":
+            return spsparse.csc_matrix(numpy.array([value] * nrows).reshape((nrows, 1)))
+        series = value * numpy.ones(nrows)
+        return series
+
+    @override
+    def _encode_numerical(
+        self,
+        values: Any,
+        metadata: Any,
+        encoder_state: dict[str, Any],
+        spec: ModelSpec,
+        drop_rows: Sequence[int],
+    ) -> Any:
+        if drop_rows:
+            values = drop_nulls(values, indices=drop_rows)
+        if spec.output == "sparse":
+            return spsparse.csc_matrix(
+                numpy.array(values).reshape((values.shape[0], 1))
+            )
+        return values
+
+    @override
+    def _encode_categorical(
+        self,
+        values: Any,
+        metadata: Any,
+        encoder_state: dict[str, Any],
+        spec: ModelSpec,
+        drop_rows: Sequence[int],
+        reduced_rank: bool = False,
+    ) -> Any:
+        # Even though we could reduce rank here, we do not, so that the same
+        # encoding can be cached for both reduced and unreduced rank. The
+        # rank will be reduced in the _encode_evaled_factor method.
+        from formulaic.transforms import encode_contrasts
+
+        if drop_rows:
+            values = drop_nulls(values, indices=drop_rows)
+        if nw.dependencies.is_narwhals_series(values):
+            values = values.to_pandas()
+
+        return as_columns(
+            encode_contrasts(
+                values,
+                reduced_rank=False,
+                output="pandas" if spec.output == "narwhals" else spec.output,
+                _metadata=metadata,
+                _state=encoder_state,
+                _spec=spec,
+            )
+        )
+
+    @override
+    def _get_columns_for_term(
+        self, factors: list[dict[str, Any]], spec: ModelSpec, scale: float = 1
+    ) -> dict[str, Any]:
+        out = {}
+
+        names = [
+            ":".join(reversed(product))
+            for product in itertools.product(*reversed(factors))
+        ]
+
+        # Pre-multiply factors with only one set of values (improves performance)
+        solo_factors = {}
+        indices = []
+        for i, factor in enumerate(factors):
+            if len(factor) == 1:
+                solo_factors.update(factor)
+                indices.append(i)
+        if solo_factors:
+            for index in reversed(indices):
+                factors.pop(index)
+            if spec.output == "sparse":
+                factors.append(
+                    {
+                        ":".join(solo_factors): functools.reduce(
+                            spsparse.csc_matrix.multiply, solo_factors.values()
+                        )
+                    }
+                )
+            else:
+                factors.append(
+                    {
+                        ":".join(solo_factors): functools.reduce(
+                            numpy.multiply,
+                            (numpy.asanyarray(p) for p in solo_factors.values()),
+                        )
+                    }
+                )
+
+        for i, reversed_product in enumerate(
+            itertools.product(*(factor.items() for factor in reversed(factors)))
+        ):
+            if spec.output == "sparse":
+                out[names[i]] = scale * functools.reduce(
+                    spsparse.csc_matrix.multiply,
+                    (p[1] for p in reversed(reversed_product)),
+                )
+            else:
+                out[names[i]] = scale * functools.reduce(
+                    numpy.multiply,
+                    (numpy.array(p[1]) for p in reversed(reversed_product)),
+                )
+        return out
+
+    @override
+    def _combine_columns(
+        self, cols: Sequence[tuple[str, Any]], spec: ModelSpec, drop_rows: Sequence[int]
+    ) -> pandas.DataFrame:
+        # Special case no columns to empty csc_matrix, array, or DataFrame
+        if not cols:
+            values = numpy.empty((self.data.shape[0], 0))
+            if spec.output == "sparse":
+                return spsparse.csc_matrix(values)
+            if spec.output == "narwhals":
+                # TODO: Inconsistent with non-empty case below (where we use to-native)
+                return nw.from_native(values, eager_only=True)
+            if spec.output == "numpy":
+                return values
+            return pandas.DataFrame(values)
+
+        # Otherwise, concatenate columns into model matrix
+        if spec.output == "sparse":
+            return spsparse.hstack([col[1] for col in cols])
+
+        # TODO: Can we do better than this? Having to reconstitute raw data
+        # does not seem ideal.
+        combined = nw.from_dict(
+            {name: nw.to_native(col, pass_through=True) for name, col in cols},
+            native_namespace=nw.get_native_namespace(self.__narwhals_data),
+        )
+        if spec.output == "narwhals":
+            if nw.dependencies.is_narwhals_dataframe(self.data):
+                return combined
+            return combined.to_native()
+        if spec.output == "pandas":
+            df = combined.to_pandas()
+            return df
+        if spec.output == "numpy":
+            return combined.to_numpy()
+        raise ValueError(f"Invalid output type: {spec.output}")
