@@ -10275,7 +10275,7 @@ def apply(
10275
10275
result_type : Literal ["expand" , "reduce" , "broadcast" ] | None = None ,
10276
10276
args = (),
10277
10277
by_row : Literal [False , "compat" ] = "compat" ,
10278
- engine : Literal ["python" , "numba" ] = "python" ,
10278
+ engine : Callable | None | Literal ["python" , "numba" ] = None ,
10279
10279
engine_kwargs : dict [str , bool ] | None = None ,
10280
10280
** kwargs ,
10281
10281
):
@@ -10339,35 +10339,32 @@ def apply(
10339
10339
10340
10340
.. versionadded:: 2.1.0
10341
10341
10342
- engine : {'python', 'numba'}, default 'python'
10343
- Choose between the python (default) engine or the numba engine in apply.
10342
+ engine : decorator or {'python', 'numba'}, optional
10343
+ Choose the execution engine to use. If not provided the function
10344
+ will be executed by the regular Python interpreter.
10344
10345
10345
- The numba engine will attempt to JIT compile the passed function,
10346
- which may result in speedups for large DataFrames.
10347
- It also supports the following engine_kwargs :
10346
+ Other options include JIT compilers such as Numba and Bodo, which in some
10347
+ cases can speed up the execution. To use an executor you can provide
10348
+ the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
10349
+ also provide the decorator with parameters, like ``numba.jit(nogil=True)``.
10348
10350
10349
- - nopython (compile the function in nopython mode)
10350
- - nogil (release the GIL inside the JIT compiled function)
10351
- - parallel (try to apply the function in parallel over the DataFrame)
10351
+ Not all functions can be executed with all execution engines. In general,
10352
+ JIT compilers will require type stability in the function (no variable
10353
+ should change data type during the execution). And not all pandas and
10354
+ NumPy APIs are supported. Check the engine documentation [1]_ and [2]_
10355
+ for limitations.
10352
10356
10353
- Note: Due to limitations within numba/how pandas interfaces with numba,
10354
- you should only use this if raw=True
10355
-
10356
- Note: The numba compiler only supports a subset of
10357
- valid Python/numpy operations.
10357
+ .. warning::
10358
10358
10359
- Please read more about the `supported python features
10360
- <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_
10361
- and `supported numpy features
10362
- <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
10363
- in numba to learn what you can or cannot use in the passed function.
10359
+ String parameters will stop being supported in a future pandas version.
10364
10360
10365
10361
.. versionadded:: 2.2.0
10366
10362
10367
10363
engine_kwargs : dict
10368
10364
Pass keyword arguments to the engine.
10369
10365
This is currently only used by the numba engine,
10370
10366
see the documentation for the engine argument for more information.
10367
+
10371
10368
**kwargs
10372
10369
Additional keyword arguments to pass as keywords arguments to
10373
10370
`func`.
@@ -10390,6 +10387,13 @@ def apply(
10390
10387
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
10391
10388
for more details.
10392
10389
10390
+ References
10391
+ ----------
10392
+ .. [1] `Numba documentation
10393
+ <https://numba.readthedocs.io/en/stable/index.html>`_
10394
+ .. [2] `Bodo documentation
10395
+ <https://docs.bodo.ai/latest/>`_
10396
+
10393
10397
Examples
10394
10398
--------
10395
10399
>>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
@@ -10458,22 +10462,99 @@ def apply(
10458
10462
0 1 2
10459
10463
1 1 2
10460
10464
2 1 2
10465
+
10466
+ Advanced users can speed up their code by using a Just-in-time (JIT) compiler
10467
+ with ``apply``. The main JIT compilers available for pandas are Numba and Bodo.
10468
+ In general, JIT compilation is only possible when the function passed to
10469
+ ``apply`` has type stability (variables in the function do not change their
10470
+ type during the execution).
10471
+
10472
+ >>> import bodo
10473
+ >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit)
10474
+
10475
+ Note that JIT compilation is only recommended for functions that take a
10476
+ significant amount of time to run. Fast functions are unlikely to run faster
10477
+ with JIT compilation.
10461
10478
"""
10462
- from pandas .core .apply import frame_apply
10479
+ if engine is None or isinstance (engine , str ):
10480
+ from pandas .core .apply import frame_apply
10463
10481
10464
- op = frame_apply (
10465
- self ,
10466
- func = func ,
10467
- axis = axis ,
10468
- raw = raw ,
10469
- result_type = result_type ,
10470
- by_row = by_row ,
10471
- engine = engine ,
10472
- engine_kwargs = engine_kwargs ,
10473
- args = args ,
10474
- kwargs = kwargs ,
10475
- )
10476
- return op .apply ().__finalize__ (self , method = "apply" )
10482
+ if engine is None :
10483
+ engine = "python"
10484
+
10485
+ if engine not in ["python" , "numba" ]:
10486
+ raise ValueError (f"Unknown engine '{ engine } '" )
10487
+
10488
+ op = frame_apply (
10489
+ self ,
10490
+ func = func ,
10491
+ axis = axis ,
10492
+ raw = raw ,
10493
+ result_type = result_type ,
10494
+ by_row = by_row ,
10495
+ engine = engine ,
10496
+ engine_kwargs = engine_kwargs ,
10497
+ args = args ,
10498
+ kwargs = kwargs ,
10499
+ )
10500
+ return op .apply ().__finalize__ (self , method = "apply" )
10501
+ elif hasattr (engine , "__pandas_udf__" ):
10502
+ if result_type is not None :
10503
+ raise NotImplementedError (
10504
+ f"{ result_type = } only implemented for the default engine"
10505
+ )
10506
+
10507
+ agg_axis = self ._get_agg_axis (self ._get_axis_number (axis ))
10508
+
10509
+ # one axis is empty
10510
+ if not all (self .shape ):
10511
+ func = cast (Callable , func )
10512
+ try :
10513
+ if axis == 0 :
10514
+ r = func (Series ([], dtype = np .float64 ), * args , ** kwargs )
10515
+ else :
10516
+ r = func (
10517
+ Series (index = self .columns , dtype = np .float64 ),
10518
+ * args ,
10519
+ ** kwargs ,
10520
+ )
10521
+ except Exception :
10522
+ pass
10523
+ else :
10524
+ if not isinstance (r , Series ):
10525
+ if len (agg_axis ):
10526
+ r = func (Series ([], dtype = np .float64 ), * args , ** kwargs )
10527
+ else :
10528
+ r = np .nan
10529
+
10530
+ return self ._constructor_sliced (r , index = agg_axis )
10531
+ return self .copy ()
10532
+
10533
+ data : DataFrame | np .ndarray = self
10534
+ if raw :
10535
+ # This will upcast the whole DataFrame to the same type,
10536
+ # and likely result in an object 2D array.
10537
+ # We should probably pass a list of 1D arrays instead, at
10538
+ # least for ``axis=0``
10539
+ data = self .values
10540
+ result = engine .__pandas_udf__ .apply (
10541
+ data = data ,
10542
+ func = func ,
10543
+ args = args ,
10544
+ kwargs = kwargs ,
10545
+ decorator = engine ,
10546
+ axis = axis ,
10547
+ )
10548
+ if raw :
10549
+ if result .ndim == 2 :
10550
+ return self ._constructor (
10551
+ result , index = self .index , columns = self .columns
10552
+ )
10553
+ else :
10554
+ return self ._constructor_sliced (result , index = agg_axis )
10555
+ return result
10556
+ else :
10557
+ raise ValueError (f"Unknown engine { engine } " )
10477
10558
10478
10559
def map (
10479
10560
self , func : PythonFuncType , na_action : Literal ["ignore" ] | None = None , ** kwargs
@@ -10590,9 +10671,11 @@ def _append(
10590
10671
10591
10672
index = Index (
10592
10673
[other .name ],
10593
- name = self .index .names
10594
- if isinstance (self .index , MultiIndex )
10595
- else self .index .name ,
10674
+ name = (
10675
+ self .index .names
10676
+ if isinstance (self .index , MultiIndex )
10677
+ else self .index .name
10678
+ ),
10596
10679
)
10597
10680
row_df = other .to_frame ().T
10598
10681
# infer_objects is needed for
0 commit comments