pandas-dev · scott-routledge2 · Dec 30, 2024 · Dec 30, 2024 · Dec 30, 2024 · Dec 30, 2024
@@ -3,9 +3,24 @@ inputs:
   environment-file:
     description: Conda environment file to use.
     default: environment.yml
+  os:
+    description: The operating system to assume when creating the Conda environment.
+    default: not specified
 runs:
   using: composite
   steps:
+    # Remove bodo from the Windows environment until bodo supports Windows.
+    - name: Remove bodo on Windows
+      if: ${{ inputs.os == 'windows-latest' }}
+      run: |
+
+        sed '/bodo/d' "$ENVIRONMENT_FILE" > tmp.txt
+        cat tmp.txt > "$ENVIRONMENT_FILE"
+        rm tmp.txt
+      env:
+        ENVIRONMENT_FILE: ${{ inputs.environment-file }}
+      shell: bash -el {0}
+
     - name: Install ${{ inputs.environment-file }}
       uses: mamba-org/setup-micromamba@v1
       with:

@@ -212,6 +212,7 @@ jobs:
         uses: ./.github/actions/setup-conda
         with:
           environment-file: ci/deps/${{ matrix.env_file }}
+          os: ${{ matrix.os }}
 
       - name: Build Pandas
         uses: ./.github/actions/build_pandas

@@ -1,6 +1,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.10
 
@@ -35,6 +36,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2025.1 # [not win]
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

@@ -2,6 +2,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.11
 
@@ -36,6 +37,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2025.1 # [not win]
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

@@ -1,6 +1,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.11
 
@@ -35,6 +36,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2025.1 # [not win]
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

@@ -1,6 +1,7 @@
 name: pandas-dev-312
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.12
 
@@ -35,6 +36,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2025.1 # [not win]
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

@@ -11,8 +11,19 @@ COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.t
 PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET"
 
 if [[ "$PATTERN" ]]; then
-  PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
+  # Parenthesize the caller's pattern so that an "or" inside it cannot
+  # bypass the bodo_udf_engine exclusion through operator precedence.
+  PYTEST_CMD="$PYTEST_CMD -m \"($PATTERN) and not bodo_udf_engine\""
+else
+  PYTEST_CMD="$PYTEST_CMD -m \"not bodo_udf_engine\""
 fi
 
 echo "$PYTEST_CMD"
 sh -c "$PYTEST_CMD"
+
+# Bodo tests need to run in a separate session to prevent installed extensions from conflicting with numba.
+if [[ "$PYTEST_WORKERS" == "0" ]]; then
+  # Run without setting PYTHONDEVMODE since it can cause segmentation faults during compilation.
+  PYTEST_CMD_BODO_UDF_ENGINE="MESONPY_EDITABLE_VERBOSE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET -m \"bodo_udf_engine\""
+  echo "Running Bodo Tests..."
+  # Quote the expansion so the command is printed verbatim (no word splitting or globbing).
+  echo "$PYTEST_CMD_BODO_UDF_ENGINE"
+  sh -c "$PYTEST_CMD_BODO_UDF_ENGINE"
+fi
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -186,6 +186,7 @@ Dependency                                            Minimum Version    pip ext
 `numexpr <https://github.com/pydata/numexpr>`__       2.8.4              performance        Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
 `bottleneck <https://github.com/pydata/bottleneck>`__ 1.3.6              performance        Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
 `numba <https://github.com/numba/numba>`__            0.56.4             performance        Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
+`bodo <https://github.com/bodo-ai/Bodo>`__            2025.1             performance        Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes using MPI.
 ===================================================== ================== ================== ===================================================================================================================================================================================
 
 Visualization

diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst
@@ -35,6 +35,7 @@ Exceptions and warnings
    errors.DtypeWarning
    errors.DuplicateLabelError
    errors.EmptyDataError
+   errors.ExecutionError
    errors.IncompatibilityWarning
    errors.IndexingError
    errors.InvalidColumnName

diff --git a/environment.yml b/environment.yml
@@ -2,6 +2,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.10
   - pip
@@ -40,6 +41,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2025.1
   - numexpr>=2.8.4
   - openpyxl>=3.1.0
   - odfpy>=1.4.1

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -57,6 +57,7 @@
     "tzdata": "2022.7",
     "qtpy": "2.3.0",
     "pyqt5": "5.15.9",
+    "bodo": "2025.1",
 }
 
 # A mapping from import name to package name (on PyPI) for packages where

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -27,7 +27,10 @@
     npt,
 )
 from pandas.compat._optional import import_optional_dependency
-from pandas.errors import SpecificationError
+from pandas.errors import (
+    ExecutionError,
+    SpecificationError,
+)
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.cast import is_nested_object
@@ -598,9 +601,9 @@ def apply_list_or_dict_like(self) -> DataFrame | Series:
             Result when self.func is a list-like or dict-like, None otherwise.
         """
 
-        if self.engine == "numba":
+        if self.engine in ("numba", "bodo"):
             raise NotImplementedError(
-                "The 'numba' engine doesn't support list-like/"
+                f"The '{self.engine}' engine doesn't support list-like/"
                 "dict likes of callables yet."
             )
 
@@ -853,9 +856,9 @@ def apply(self) -> DataFrame | Series:
 
         # dispatch to handle list-like or dict-like
         if is_list_like(self.func):
-            if self.engine == "numba":
+            if self.engine in ("numba", "bodo"):
                 raise NotImplementedError(
-                    "the 'numba' engine doesn't support lists of callables yet"
+                    f"the '{self.engine}' engine doesn't support lists of callables yet"
                 )
             return self.apply_list_or_dict_like()
 
@@ -870,13 +873,16 @@ def apply(self) -> DataFrame | Series:
                     "the 'numba' engine doesn't support using "
                     "a string as the callable function"
                 )
+            elif self.engine == "bodo":
+                return self.apply_series_bodo()
+
             return self.apply_str()
 
         # ufunc
         elif isinstance(self.func, np.ufunc):
-            if self.engine == "numba":
+            if self.engine in ("numba", "bodo"):
                 raise NotImplementedError(
-                    "the 'numba' engine doesn't support "
+                    f"the '{self.engine}' engine doesn't support "
                     "using a numpy ufunc as the callable function"
                 )
             with np.errstate(all="ignore"):
@@ -886,9 +892,10 @@ def apply(self) -> DataFrame | Series:
 
         # broadcasting
         if self.result_type == "broadcast":
-            if self.engine == "numba":
+            if self.engine in ("numba", "bodo"):
                 raise NotImplementedError(
-                    "the 'numba' engine doesn't support result_type='broadcast'"
+                    f"the '{self.engine}' engine doesn't support "
+                    "result_type='broadcast'"
                 )
             return self.apply_broadcast(self.obj)
 
@@ -1007,6 +1014,8 @@ def wrapper(*args, **kwargs):
             result = nb_looper(self.values, self.axis, *args)
             # If we made the result 2-D, squeeze it back to 1-D
             result = np.squeeze(result)
+        elif self.engine == "bodo":
+            raise NotImplementedError("the 'bodo' engine does not support raw=True.")
         else:
             result = np.apply_along_axis(
                 wrap_function(self.func),
@@ -1051,10 +1060,17 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
         return result
 
     def apply_standard(self):
+        """
+        Apply ``self.func`` with the engine named by ``self.engine``.
+
+        The 'numba' and 'python' engines produce ``(results, res_index)``,
+        which is passed through ``self.wrap_results``. The 'bodo' engine
+        returns the value of ``self.apply_series_bodo()`` directly.
+
+        Raises
+        ------
+        ValueError
+            If ``self.engine`` is not one of 'python', 'numba' or 'bodo'.
+        """
-        if self.engine == "python":
+        if self.engine == "numba":
+            results, res_index = self.apply_series_numba()
+        elif self.engine == "bodo":
+            # Returned as-is: the bodo path does not go through wrap_results.
+            return self.apply_series_bodo()
+        elif self.engine == "python":
             results, res_index = self.apply_series_generator()
         else:
-            results, res_index = self.apply_series_numba()
+            raise ValueError(
+                "invalid value for engine, must be one "
+                "of {'python', 'numba', 'bodo'}"
+            )
 
         # wrap results
         return self.wrap_results(results, res_index)
@@ -1089,6 +1105,36 @@ def apply_series_numba(self):
         results = self.apply_with_numba()
         return results, self.result_index
 
+    def apply_series_bodo(self) -> DataFrame | Series:
+        """
+        Apply ``self.func`` to ``self.obj`` using the bodo engine.
+
+        The call ``obj.apply(func, axis)`` is wrapped in a function decorated
+        with ``bodo.jit`` (configured by ``self.engine_kwargs``) and invoked
+        with ``self.obj``, ``self.func`` and ``self.axis``.
+
+        Returns
+        -------
+        DataFrame or Series
+            The value returned by the jitted ``apply`` call.
+
+        Raises
+        ------
+        NotImplementedError
+            If ``result_type`` is set, if ``axis`` is not 1 for a non-string
+            function, or if additional ``args``/``kwargs`` were supplied.
+        ExecutionError
+            If the jitted call raises ``bodo.utils.typing.BodoError``; the
+            original exception is chained as the cause.
+        """
+        # Reject options this engine does not handle before importing bodo.
+        if self.result_type is not None:
+            raise NotImplementedError(
+                "the 'bodo' engine does not support result_type yet."
+            )
+
+        # String functions are allowed on any axis; callables only with axis=1.
+        if self.axis != 1 and not isinstance(self.func, str):
+            raise NotImplementedError(
+                "the 'bodo' engine only supports axis=1 for user-defined functions."
+            )
+
+        if self.args or self.kwargs:
+            raise NotImplementedError(
+                "the 'bodo' engine does not support passing additional args/kwargs "
+                "to apply function yet."
+            )
+
+        # bodo is an optional dependency, so it is imported only when needed.
+        bodo = import_optional_dependency("bodo")
+
+        # The whole apply call is jitted; the inner apply uses no engine argument.
+        @bodo.jit(**self.engine_kwargs)
+        def do_apply(obj, func, axis):
+            return obj.apply(func, axis)
+
+        try:
+            result = do_apply(self.obj, self.func, self.axis)
+        except bodo.utils.typing.BodoError as e:
+            # Surface engine failures as a pandas error, keeping the cause.
+            raise ExecutionError("Execution with engine='bodo' failed.") from e
+
+        return result
+
     def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
         from pandas import Series
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10254,7 +10254,7 @@ def apply(
         result_type: Literal["expand", "reduce", "broadcast"] | None = None,
         args=(),
         by_row: Literal[False, "compat"] = "compat",
-        engine: Literal["python", "numba"] = "python",
+        engine: Literal["python", "numba", "bodo"] = "python",
         engine_kwargs: dict[str, bool] | None = None,
         **kwargs,
     ):
@@ -10316,7 +10316,7 @@ def apply(
 
             .. versionadded:: 2.1.0
 
-        engine : {'python', 'numba'}, default 'python'
+        engine : {'python', 'numba', 'bodo'}, default 'python'
             Choose between the python (default) engine or the numba engine in apply.
 
             The numba engine will attempt to JIT compile the passed function,
@@ -10339,6 +10339,19 @@ def apply(
             <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
             in numba to learn what you can or cannot use in the passed function.
 
+            The bodo engine will attempt to JIT compile the passed function, spawn
+            multiple workers and apply the function in parallel over the DataFrame,
+            which may result in a speedup for large DataFrames.
+
+            Bodo supports a subset of valid Python, numpy, pandas and scikit-learn.
+            Please refer to the `bodo documentation
+            <https://docs.bodo.ai/latest/api_docs/>`_ to learn more about which
+            operations and APIs are supported inside JIT compiled functions.
+
+            Code that does not have JIT support yet can still utilize Bodo's parallel
+            constructs by decorating the function with `@wrap_python
+            <https://docs.bodo.ai/latest/objmode/?h=wrap_py>`_.
+
             .. versionadded:: 2.2.0
 
         engine_kwargs : dict

diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
@@ -417,6 +417,21 @@ class NumbaUtilError(Exception):
     """
 
 
+class ExecutionError(Exception):
+    """
+    Error raised from internal errors originating in engines.
+
+    For example, ``DataFrame.apply`` raises this error when it is called with
+    ``engine="bodo"`` and the engine fails while handling the function. The
+    engine's own exception is chained as the cause.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"A": [1, 2, 3], "B": ["1", "2", "3"]})
+    >>> df.apply(lambda x: x.A + x.B, engine="bodo", axis=1)
+    Traceback (most recent call last):
+        ...
+    pandas.errors.ExecutionError: Execution with engine='bodo' failed.
+
+    """
+
+
 class DuplicateLabelError(ValueError):
     """
     Error raised when an operation would introduce duplicate labels.
@@ -916,6 +931,7 @@ class InvalidComparison(Exception):
     "DtypeWarning",
     "DuplicateLabelError",
     "EmptyDataError",
+    "ExecutionError",
     "IncompatibilityWarning",
     "IndexingError",
     "IntCastingNaNError",