Skip to content

Commit e2fcf12

Browse files
authored
Introduce basic "cudf" backend for Dask Expressions (#14805)
Mostly addresses #15027 dask/dask-expr#728 exposed the necessary mechanisms for us to define a custom dask-expr backend for `cudf`. The new dispatching mechanisms are effectively the same as those in `dask.dataframe`. The only difference is that we are now registering/implementing "expression-based" collections. This PR does the following: - Defines a basic `DataFrameBackendEntrypoint` class for collection creation, and registers new collections using `get_collection_type`. - Refactors the `dask_cudf` import structure to properly support the `"dataframe.query-planning"` configuration. - Modifies CI to test dask-expr support for some of the `dask_cudf` tests. This coverage can be expanded in follow-up work. ~**Experimental Change**: This PR patches `dask_expr._expr.Expr.__new__` to enable type-based dispatching. This effectively allows us to surgically replace problematic `Expr` subclasses that do not work for cudf-backed data. For example, this PR replaces the upstream `TakeLast` expression to avoid using `squeeze` (since this method is not supported by cudf). This particular fix can be moved upstream relatively easily. 
However, having this kind of "patching" mechanism may be valuable for more complicated pandas/cudf discrepancies.~ ## Usage example ```python from dask import config config.set({"dataframe.query-planning": True}) import dask_cudf df = dask_cudf.DataFrame.from_dict( {"x": range(100), "y": [1, 2, 3, 4] * 25, "z": ["1", "2"] * 50}, npartitions=10, ) df["y2"] = df["x"] + df["y"] agg = df.groupby("y").agg({"y2": "mean"})["y2"] agg.simplify().pprint() ``` Dask cuDF should now be using dask-expr for "query planning": ``` Projection: columns='y2' GroupbyAggregation: arg={'y2': 'mean'} observed=True split_out=1'y' Assign: y2= Projection: columns=['y'] FromPandas: frame='<dataframe>' npartitions=10 columns=['x', 'y'] Add: Projection: columns='x' FromPandas: frame='<dataframe>' npartitions=10 columns=['x', 'y'] Projection: columns='y' FromPandas: frame='<dataframe>' npartitions=10 columns=['x', 'y'] ``` ## TODO - [x] Add basic tests - [x] Confirm that general design makes sense **Follow Up Work**: - Expand dask-expr test coverage - Fix local and upstream bugs - Add documentation once "critical mass" is reached Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Ray Douglass (https://github.com/raydouglass) URL: #14805
1 parent 63c9ed7 commit e2fcf12

24 files changed

+545
-123
lines changed

ci/test_python_other.sh

+8
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ rapids-logger "pytest dask_cudf"
2929
--cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
3030
--cov-report=term
3131

32+
# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
33+
rapids-logger "pytest dask_cudf + dask_expr"
34+
DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
35+
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
36+
--numprocesses=8 \
37+
--dist=loadscope \
38+
.
39+
3240
rapids-logger "pytest custreamz"
3341
./ci/run_custreamz_pytests.sh \
3442
--junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \

ci/test_wheel_dask_cudf.sh

+9
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,12 @@ python -m pytest \
3838
--numprocesses=8 \
3939
.
4040
popd
41+
42+
# Run tests in dask_cudf/tests and dask_cudf/io/tests with dask-expr
43+
rapids-logger "pytest dask_cudf + dask_expr"
44+
pushd python/dask_cudf/dask_cudf
45+
DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
46+
--junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-expr.xml" \
47+
--numprocesses=8 \
48+
.
49+
popd
+54-8
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,75 @@
1-
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
1+
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
22

3+
from dask import config
4+
5+
# For dask>2024.2.0, we can silence the loud deprecation
6+
# warning before importing `dask.dataframe` (this won't
7+
# do anything for dask==2024.2.0)
8+
config.set({"dataframe.query-planning-warning": False})
9+
10+
import dask.dataframe as dd
311
from dask.dataframe import from_delayed
412

513
import cudf
614

715
from . import backends
816
from ._version import __git_commit__, __version__
9-
from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe
10-
from .groupby import groupby_agg
11-
from .io import read_csv, read_json, read_orc, read_text, to_orc
17+
from .core import concat, from_cudf, from_dask_dataframe
18+
from .expr import QUERY_PLANNING_ON
19+
20+
21+
# Backend override shared by all of the top-level read_* wrappers below.
_CUDF_BACKEND = {"dataframe.backend": "cudf"}


def read_csv(*args, **kwargs):
    """Read CSV data into a cudf-backed Dask collection.

    Thin wrapper around ``dask.dataframe.read_csv`` that forces the
    "cudf" dataframe backend for the duration of the call.
    """
    with config.set(_CUDF_BACKEND):
        return dd.read_csv(*args, **kwargs)


def read_json(*args, **kwargs):
    """Read JSON data into a cudf-backed Dask collection.

    Thin wrapper around ``dask.dataframe.read_json`` with the "cudf"
    backend enabled.
    """
    with config.set(_CUDF_BACKEND):
        return dd.read_json(*args, **kwargs)


def read_orc(*args, **kwargs):
    """Read ORC data into a cudf-backed Dask collection.

    Thin wrapper around ``dask.dataframe.read_orc`` with the "cudf"
    backend enabled.
    """
    with config.set(_CUDF_BACKEND):
        return dd.read_orc(*args, **kwargs)


def read_parquet(*args, **kwargs):
    """Read Parquet data into a cudf-backed Dask collection.

    Thin wrapper around ``dask.dataframe.read_parquet`` with the "cudf"
    backend enabled.
    """
    with config.set(_CUDF_BACKEND):
        return dd.read_parquet(*args, **kwargs)
39+
40+
41+
def raise_not_implemented_error(attr_name):
    """Build a placeholder callable for an unavailable top-level API.

    The returned function accepts any arguments and always raises
    ``NotImplementedError`` naming *attr_name*.
    """
    message = f"Top-level {attr_name} API is not available for dask-expr."

    def _unsupported(*args, **kwargs):
        raise NotImplementedError(message)

    return _unsupported
48+
49+
50+
if QUERY_PLANNING_ON:
51+
from .expr._collection import DataFrame, Index, Series
52+
53+
groupby_agg = raise_not_implemented_error("groupby_agg")
54+
read_text = raise_not_implemented_error("read_text")
55+
to_orc = raise_not_implemented_error("to_orc")
56+
else:
57+
from .core import DataFrame, Index, Series
58+
from .groupby import groupby_agg
59+
from .io import read_text, to_orc
1260

13-
try:
14-
from .io import read_parquet
15-
except ImportError:
16-
pass
1761

1862
__all__ = [
1963
"DataFrame",
2064
"Series",
65+
"Index",
2166
"from_cudf",
2267
"from_dask_dataframe",
2368
"concat",
2469
"from_delayed",
2570
]
2671

72+
2773
if not hasattr(cudf.DataFrame, "mean"):
2874
cudf.DataFrame.mean = None
2975
del cudf

python/dask_cudf/dask_cudf/backends.py

+59-4
Original file line numberDiff line numberDiff line change
@@ -627,13 +627,68 @@ def read_csv(*args, **kwargs):
627627

628628
@staticmethod
def read_hdf(*args, **kwargs):
    """Read HDF5 data via the default (pandas) backend, then convert
    the resulting collection to the "cudf" backend.
    """
    # HDF5 reader not yet implemented in cudf
    warnings.warn(
        "read_hdf is not yet implemented in cudf/dask_cudf. "
        "Moving to cudf from pandas. Expect poor performance!"
    )
    pandas_backed = _default_backend(dd.read_hdf, *args, **kwargs)
    return pandas_backed.to_backend("cudf")
638+
639+
640+
# Define "cudf" backend entrypoint for dask-expr
class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint):
    """Backend-entrypoint class for Dask-Expressions

    This class is registered under the name "cudf" for the
    ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``.
    Dask-DataFrame will use the methods defined in this class
    in place of ``dask_expr.<creation-method>`` when the
    "dataframe.backend" configuration is set to "cudf":

    Examples
    --------
    >>> import dask
    >>> import dask_expr as dx
    >>> with dask.config.set({"dataframe.backend": "cudf"}):
    ...     ddf = dx.from_dict({"a": range(10)})
    >>> type(ddf._meta)
    <class 'cudf.core.dataframe.DataFrame'>
    """

    @classmethod
    def to_backend_dispatch(cls):
        # Delegate to the legacy (non-expr) cudf entrypoint's dispatch.
        return CudfBackendEntrypoint.to_backend_dispatch()

    @classmethod
    def to_backend(cls, *args, **kwargs):
        # Delegate to the legacy (non-expr) cudf entrypoint.
        return CudfBackendEntrypoint.to_backend(*args, **kwargs)

    @staticmethod
    def from_dict(
        data,
        npartitions,
        orient="columns",
        dtype=None,
        columns=None,
        constructor=cudf.DataFrame,
    ):
        """Create a cudf-backed collection from a dict.

        Forwards to ``dask_expr.from_dict`` via the default backend,
        with ``cudf.DataFrame`` as the default constructor.
        """
        # Imported lazily so this module stays importable when
        # dask-expr is not installed (see the guarded import below).
        import dask_expr as dx

        return _default_backend(
            dx.from_dict,
            data,
            npartitions=npartitions,
            orient=orient,
            dtype=dtype,
            columns=columns,
            constructor=constructor,
        )
688+
689+
690+
# Import/register cudf-specific classes for dask-expr
691+
try:
692+
import dask_cudf.expr # noqa: F401
693+
except ImportError:
694+
pass

python/dask_cudf/dask_cudf/core.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -685,18 +685,27 @@ def reduction(
685685

686686
@_dask_cudf_nvtx_annotate
687687
def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
688+
from dask_cudf import QUERY_PLANNING_ON
689+
688690
if isinstance(getattr(data, "index", None), cudf.MultiIndex):
689691
raise NotImplementedError(
690692
"dask_cudf does not support MultiIndex Dataframes."
691693
)
692694

693-
name = name or ("from_cudf-" + tokenize(data, npartitions or chunksize))
695+
# Dask-expr doesn't support the `name` argument
696+
name = {}
697+
if not QUERY_PLANNING_ON:
698+
name = {
699+
"name": name
700+
or ("from_cudf-" + tokenize(data, npartitions or chunksize))
701+
}
702+
694703
return dd.from_pandas(
695704
data,
696705
npartitions=npartitions,
697706
chunksize=chunksize,
698707
sort=sort,
699-
name=name,
708+
**name,
700709
)
701710

702711

@@ -711,7 +720,10 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
711720
rather than pandas objects.\n
712721
"""
713722
)
714-
+ textwrap.dedent(dd.from_pandas.__doc__)
723+
# TODO: `dd.from_pandas.__doc__` is empty when
724+
# `DASK_DATAFRAME__QUERY_PLANNING=True`
725+
# since dask-expr does not provide a docstring for from_pandas.
726+
+ textwrap.dedent(dd.from_pandas.__doc__ or "")
715727
)
716728

717729

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION.

from dask import config

# Check if dask-dataframe is using dask-expr.
# For dask>=2024.3.0, a null value will default to True
QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False

# Register custom expressions and collections (imported for their
# registration side effects only).
try:
    import dask_cudf.expr._collection  # noqa: F401
    import dask_cudf.expr._expr  # noqa: F401

except ImportError as err:
    if QUERY_PLANNING_ON:
        # Dask *should* raise an error before this.
        # However, we can still raise here to be certain.
        # Chain the original ImportError explicitly so the root
        # cause is preserved in the traceback.
        raise RuntimeError(
            "Failed to register the 'cudf' backend for dask-expr."
            " Please make sure you have dask-expr installed.\n"
            f"Error Message: {err}"
        ) from err
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION.
2+
3+
from functools import cached_property
4+
5+
from dask_expr import (
6+
DataFrame as DXDataFrame,
7+
FrameBase,
8+
Index as DXIndex,
9+
Series as DXSeries,
10+
get_collection_type,
11+
)
12+
from dask_expr._collection import new_collection
13+
from dask_expr._util import _raise_if_object_series
14+
15+
from dask import config
16+
from dask.dataframe.core import is_dataframe_like
17+
18+
import cudf
19+
20+
##
21+
## Custom collection classes
22+
##
23+
24+
25+
# VarMixin can be removed if cudf#15179 is addressed.
# See: https://github.com/rapidsai/cudf/issues/15179
class VarMixin:
    """Shared ``var`` override for cudf-backed dask-expr collections."""

    def var(
        self,
        axis=0,
        skipna=True,
        ddof=1,
        numeric_only=False,
        split_every=False,
        **kwargs,
    ):
        _raise_if_object_series(self, "var")
        axis = self._validate_axis(axis)
        # Result discarded — presumably evaluated on the meta object so
        # invalid arguments raise eagerly (TODO confirm).
        self._meta.var(axis=axis, skipna=skipna, numeric_only=numeric_only)
        frame = self
        if is_dataframe_like(self._meta) and numeric_only:
            # Convert to pandas - cudf does something weird here
            numeric_columns = (
                self._meta.to_pandas().var(numeric_only=True).index
            )
            frame = frame[list(numeric_columns)]
        var_expr = frame.expr.var(
            axis, skipna, ddof, numeric_only, split_every=split_every
        )
        return new_collection(var_expr)
50+
51+
52+
class DataFrame(VarMixin, DXDataFrame):
    """cudf-backed dask-expr DataFrame collection."""

    @classmethod
    def from_dict(cls, *args, **kwargs):
        # Force the "cudf" backend so the parent constructor produces
        # a cudf-backed collection.
        with config.set({"dataframe.backend": "cudf"}):
            return DXDataFrame.from_dict(*args, **kwargs)

    def groupby(
        self,
        by,
        group_keys=True,
        sort=None,
        observed=None,
        dropna=None,
        **kwargs,
    ):
        from dask_cudf.expr._groupby import GroupBy

        # Grouping by an arbitrary collection is only supported when it
        # is a Series; reject other FrameBase instances up front.
        if isinstance(by, FrameBase) and not isinstance(by, DXSeries):
            raise ValueError(
                f"`by` must be a column name or list of columns, got {by}."
            )

        grouped = GroupBy(
            self,
            by,
            group_keys=group_keys,
            sort=sort,
            observed=observed,
            dropna=dropna,
            **kwargs,
        )
        return grouped
84+
85+
class Series(VarMixin, DXSeries):
    """cudf-backed dask-expr Series collection."""

    def groupby(self, by, **kwargs):
        from dask_cudf.expr._groupby import SeriesGroupBy

        grouped = SeriesGroupBy(self, by, **kwargs)
        return grouped

    @cached_property
    def list(self):
        # ``ListMethods`` accessor namespace, cached per instance.
        from dask_cudf.accessors import ListMethods

        return ListMethods(self)

    @cached_property
    def struct(self):
        # ``StructMethods`` accessor namespace, cached per instance.
        from dask_cudf.accessors import StructMethods

        return StructMethods(self)
102+
103+
104+
class Index(DXIndex):
    """cudf-backed dask-expr Index collection."""

    # Same as pandas (for now)
    pass


# Tell dask-expr which collection classes wrap cudf-backed expressions.
get_collection_type.register(cudf.BaseIndex, lambda _: Index)
get_collection_type.register(cudf.Series, lambda _: Series)
get_collection_type.register(cudf.DataFrame, lambda _: DataFrame)

0 commit comments

Comments
 (0)