diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 9bfc7c9d91791..1414f9490d686 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -234,7 +234,8 @@ def new_init(self, *args, **kwargs): set_grad_enabled, ) from .device import ( # noqa: F401 - PaddleStream as Stream, + Event, + Stream, device_guard, get_cudnn_version, get_default_device, @@ -247,6 +248,7 @@ def new_init(self, *args, **kwargs): is_compiled_with_ipu, is_compiled_with_rocm, is_compiled_with_xpu, + set_default_device, set_device, ) from .distributed import DataParallel @@ -933,6 +935,7 @@ def __dir__(self): raise err kernel32.SetErrorMode(prev_error_mode) + disable_static() from .pir_utils import IrGuard diff --git a/python/paddle/cuda/__init__.py b/python/paddle/cuda/__init__.py index 650df07b77c87..277f2d3248626 100644 --- a/python/paddle/cuda/__init__.py +++ b/python/paddle/cuda/__init__.py @@ -21,11 +21,17 @@ import paddle from paddle import base, core, device as paddle_device, framework from paddle.device import ( - PaddleStream as Stream, + Event, + Stream, _device_to_paddle as _device_to_paddle, + device, is_available as _device_is_available, + is_bf16_supported, is_current_stream_capturing as _is_current_stream_capturing, + manual_seed, manual_seed_all as device_manual_seed_all, + reset_peak_memory_stats, + set_stream, stream_guard as _PaddleStreamGuard, ) @@ -644,6 +650,109 @@ def memory_allocated(device: DeviceLike = None) -> int: return paddle_device.memory_allocated(device) +def max_memory_allocated(device: DeviceLike = None) -> int: + ''' + Return the peak size of memory that is allocated to tensor of the given device. + + Note: + The size of memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. + For instance, a float32 0-D Tensor with shape [] will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. 
+ + Args: + device(paddle.CUDAPlace|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Return: + int: The peak size of memory that is allocated to tensor of the given device, in bytes. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') # or '' + + >>> max_memory_allocated_size = paddle.cuda.max_memory_allocated(paddle.CUDAPlace(0)) + >>> max_memory_allocated_size = paddle.cuda.max_memory_allocated(0) + >>> max_memory_allocated_size = paddle.cuda.max_memory_allocated("gpu:0") + ''' + return paddle_device.max_memory_allocated(device) + + +def max_memory_reserved(device: DeviceLike = None) -> int: + ''' + Return the peak size of memory that is held by the allocator of the given device. + + Args: + device(paddle.Place|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Return: + int: The peak size of memory that is held by the allocator of the given device, in bytes. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') # or '' + + >>> max_memory_reserved_size = paddle.cuda.max_memory_reserved(paddle.CUDAPlace(0)) + >>> max_memory_reserved_size = paddle.cuda.max_memory_reserved(0) + >>> max_memory_reserved_size = paddle.cuda.max_memory_reserved("gpu:0") + ''' + return paddle_device.max_memory_reserved(device) + + +def reset_max_memory_allocated(device: DeviceLike | None = None) -> None: + ''' + Reset the peak size of memory that is allocated to tensor of the given device. + + Args: + device(paddle.Place|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. 
+ Default: None. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') # or '' + + >>> paddle.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0)) + >>> paddle.cuda.reset_max_memory_allocated(0) + >>> paddle.cuda.reset_max_memory_allocated("gpu:0") + ''' + + return paddle_device.reset_max_memory_allocated(device) + + +def reset_max_memory_reserved(device: DeviceLike | None = None) -> None: + ''' + Reset the peak size of memory that is held by the allocator of the given device. + + Args: + device(paddle.Place|int|str|None, optional): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') # or '' + + >>> paddle.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0)) + >>> paddle.cuda.reset_max_memory_reserved(0) + >>> paddle.cuda.reset_max_memory_reserved("gpu:0") + ''' + return paddle_device.reset_max_memory_reserved(device) + + def memory_reserved(device: DeviceLike = None) -> int: """ Return the current device memory managed by the caching allocator in bytes for a given device. 
@@ -796,7 +905,14 @@ def get_stream_from_external( "memory_allocated", "memory_reserved", "set_device", + "set_stream", "manual_seed_all", "get_rng_state", "set_rng_state", + "device", + "is_bf16_supported", + "manual_seed", + "max_memory_allocated", + "reset_peak_memory_stats", + "Event", ] diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 5d79c5bd07a81..6450ca62813d9 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -73,6 +73,7 @@ empty_cache, get_device_properties as _get_device_properties, get_rng_state, + manual_seed, max_memory_allocated, max_memory_reserved, memory_allocated, @@ -88,6 +89,7 @@ device_count, empty_cache, get_rng_state, + manual_seed, max_memory_allocated, max_memory_reserved, memory_allocated, @@ -109,6 +111,7 @@ empty_cache, get_device_properties as _get_device_properties, get_rng_state, + manual_seed, max_memory_allocated, max_memory_reserved, memory_allocated, @@ -122,6 +125,11 @@ from .cpu import ( device_count, get_rng_state, + manual_seed, + max_memory_allocated, + max_memory_reserved, + reset_max_memory_allocated, + reset_max_memory_reserved, set_rng_state, ) @@ -165,6 +173,10 @@ 'get_device_capability', 'get_rng_state', 'set_rng_state', + 'device', + 'is_bf16_supported', + 'manual_seed', + 'reset_peak_memory_stats', ] _cudnn_version = None @@ -442,7 +454,72 @@ def _convert_to_place(device: PlaceLike) -> Place: return place -def set_device(device: str) -> PlaceLike: +class device: + r"""Context-manager that changes the selected device. + + Args: + device (paddle.Place, int or str): device index to select. + + Examples: + .. code-block:: python + >>> import paddle + + >>> print(paddle.device.get_device()) # gpu:0 + >>> with paddle.device.device("cpu"): + ... print(paddle.device.get_device()) # cpu + + >>> # paddle.cuda.device is an alias of paddle.device.device + >>> with paddle.cuda.device("cpu"): + ... 
print(paddle.device.get_device()) # cpu + >>> print(paddle.device.get_device()) + """ + + def __init__(self, device: Place | int | str | None = None): + self.place = device_to_place(device) + self.prev_place_str = "-1" + + def __enter__(self): + self.prev_place_str = get_device() + set_device(self.place) + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: types.TracebackType | None, + ) -> bool | None: + set_device(self.prev_place_str) + return False + + +def is_bf16_supported(including_emulation: bool = True) -> bool: + """ + Return a bool indicating if the current CUDA/ROCm device supports dtype bfloat16. + + Args: + including_emulation (bool = True): Whether to treat software-emulated BF16 as supported; if False, only native hardware BF16 support is considered. + + Returns: + bool: A boolean value which indicates whether the current CUDA/ROCm device supports dtype bfloat16. + + Examples: + + .. code-block:: python + + >>> import paddle + + >>> paddle.device.is_bf16_supported() + >>> # paddle.cuda.is_bf16_supported() is an alias of paddle.device.is_bf16_supported() + >>> paddle.cuda.is_bf16_supported() + + """ + # including_emulation is not used here, but kept for compatibility with the original implementation + return core.is_bfloat16_supported( + paddle.framework._current_expected_place() + ) + + +def set_device(device: PlaceLike | int) -> PlaceLike: """ Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU. @@ -450,7 +527,7 @@ def set_device(device: str) -> PlaceLike: which the OP will run. Args: - device(str): This parameter determines the specific running device. + device(str, Place or int): This parameter determines the specific running device. It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``, where ``x`` is the index of the GPUs, XPUs or NPUs. 
@@ -469,12 +546,12 @@ def set_device(device: str) -> PlaceLike: >>> data = paddle.stack([x1,x2], axis=1) """ - place = _convert_to_place(device) + place = device_to_place(device) framework._set_expected_place(place) return place -def get_device() -> str: +def get_device(input: paddle.Tensor = None) -> str | int: """ This function can get the current global device of the program is running. @@ -482,6 +559,18 @@ def get_device() -> str: set, it will return a string which is 'gpu:x' when cuda is available or it will return a string which is 'cpu' when cuda is not available. + Returns: + if input is Tensor, this function will return the device ID where the given Tensor is located. + int: + - -1, if the Tensor is on CPU. + - The device ID (e.g., 0, 1, ...) if the Tensor is on GPU. + + if input is not Tensor, this function will return the device name where the program is running. + str: + - 'cpu': If the program is running on CPU. + - 'gpu:x': If the program is running on GPU, where `x` is the index of the GPU. + - 'xpu:x': If the program is running on XPU, where `x` is the index of the XPU. + - 'npu:x': If the program is running on NPU, where `x` is the index of the NPU. Examples: .. code-block:: python @@ -489,7 +578,16 @@ def get_device() -> str: >>> import paddle >>> device = paddle.device.get_device() + >>> x_cpu = paddle.to_tensor([1, 2, 3], place=paddle.CPUPlace()) + >>> id = paddle.get_device(x_cpu) # -1 + + + """ + if isinstance(input, paddle.Tensor): + if 'cpu' in str(input.place): + return -1 + return input.place.gpu_device_id() device = '' place = framework._current_expected_place_() if isinstance(place, core.CPUPlace): @@ -525,6 +623,25 @@ def get_default_device() -> paddle.device: return paddle.device(get_device().replace("gpu", "cuda")) +def set_default_device(device: PlaceLike | int) -> None: + """ + Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU. 
+ This function can specify the global device which the OP will run. + + Args: + device(str, Place or int): This parameter determines the specific running device. + It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``, + where ``x`` is the index of the GPUs, XPUs or NPUs. + + Examples: + .. code-block:: python + + >>> import paddle + >>> paddle.device.set_default_device("cpu") + """ + set_device(device) + + def get_all_device_type() -> list[str]: """ @@ -1049,13 +1166,14 @@ class Stream: ''' A device stream wrapper around StreamBase. + paddle.cuda.Stream() is equivalent to paddle.device.Stream(). Args: device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)|None): Which device the stream run on. If device is None, the device is the current device. Default: None. It can be ``gpu``, ``gpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevice, where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). priority(int, optional): priority of the CUDA stream. Can be either - 1 (high priority) or 2 (low priority). By default, streams have + 1 or -1 (high priority) or 0 or 2 (low priority). By default, streams have priority 2. 
Returns: @@ -1076,11 +1194,12 @@ class Stream: ''' stream_base: _InitStreamBase - device: PlaceLike + device: PlaceLike | int + _priority_map: dict[int, int] = {-1: 1, 0: 2, 1: 1, 2: 2} def __init__( self, - device: PlaceLike | None = None, + device: PlaceLike | int | None = None, priority: int = 2, stream_base: _InitStreamBase | None = None, ) -> None: @@ -1096,13 +1215,7 @@ def __init__( "stream_base should be CUDAStream, XPUStream, CustomDeviceStream" ) return - - if device is None: - self.device = paddle.framework._current_expected_place_() - elif isinstance(device, str): - self.device = paddle.device._convert_to_place(device) - else: - self.device = device + self.device = device_to_place(device) device_id = ( self.device.get_device_id() @@ -1114,7 +1227,7 @@ def __init__( if hasattr(self.device, 'get_device_type') else None ) - + priority = self._priority_map.get(priority, 2) self.stream_base = _create_stream_base( device_id=device_id, priority=priority, @@ -1296,40 +1409,6 @@ def _device_to_paddle( return dev -class PaddleStream(Stream): - """Wrapper class for Paddle CUDA/XPU Stream, supporting standard device/priority handling. - - This class inherits from the base `Stream` (renamed to `StreamBase` to avoid naming conflict) - and adds: - 1. Unified device string conversion via `_device_to_paddle` - 2. Priority mapping for user-friendly priority values - 3. Clear parameter validation and error handling - - Attributes: - _priority_map (dict[int, int]): Mapping from user-facing priority values to Paddle internal priority codes. 
- - User input: -1 (high priority), 0/2 (low priority), 1 (high priority) - - Internal code: 1 (high), 2 (low) - """ - - _priority_map: dict[int, int] = {-1: 1, 0: 2, 1: 1, 2: 2} - - def __init__( - self, - device: paddle.CUDAPlace | paddle.CustomPlace | int | str | None = None, - priority: int = 0, - *args, - **kwargs, - ): - paddle_device = _device_to_paddle(device) - paddle_priority = self._priority_map.get(priority, 2) - super().__init__( - device=paddle_device, - priority=paddle_priority, - *args, - **kwargs, - ) - - def current_stream(device: PlaceLike | None = None) -> Stream: ''' @@ -1404,6 +1483,7 @@ def set_stream(stream: Stream) -> Stream: >>> paddle.set_device('custom_cpu') >>> s = paddle.device.Stream() + >>> # paddle.cuda.set_stream(s) is equivalent to paddle.device.set_stream(s) >>> paddle.device.set_stream(s) ''' @@ -1675,6 +1755,27 @@ def manual_seed_all(seed: int) -> None: paddle.seed(seed) +def reset_peak_memory_stats(device: PlaceLike | int | None = None) -> None: + """ + Resets the peak memory statistics of the given device. + + This method resets the peak memory usage recorded for the given device during the execution of the program. + It sets the peak memory usage back to zero for that device (the current device if ``device`` is None). + + Example: + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') # or '' + + >>> # paddle.cuda.reset_peak_memory_stats() is equivalent to paddle.device.reset_peak_memory_stats() + + >>> paddle.device.reset_peak_memory_stats(paddle.CUDAPlace(0)) + >>> paddle.device.reset_peak_memory_stats(0) + >>> paddle.device.reset_peak_memory_stats("gpu:0") + """ + reset_max_memory_allocated(device) + + class Device(str): """ Paddle computing device. 
diff --git a/python/paddle/device/cpu.py b/python/paddle/device/cpu.py index c9706a812733d..af7914f7fd44a 100644 --- a/python/paddle/device/cpu.py +++ b/python/paddle/device/cpu.py @@ -107,3 +107,75 @@ def set_rng_state( >>> paddle.device.set_rng_state(state) """ core.default_cpu_generator().set_state(new_state) + + +def manual_seed(seed: int) -> None: + r"""Set the seed for generating random numbers for the current Device. + + .. warning:: + If you are working with a multi-Device model, this function is insufficient + to get determinism. To seed all Devices, use :func:`manual_seed_all`. + + Sets the seed for global default generator, which manages the random number generation. + + Args: + seed(int): The random seed to set. + + Returns: + None + + Examples: + .. code-block:: python + + >>> import paddle + >>> paddle.device.manual_seed(102) + >>> # paddle.cuda.manual_seed(102) is equivalent to paddle.device.manual_seed(102) + >>> paddle.cuda.manual_seed(102) + + """ + seed = int(seed) + core.default_cpu_generator().manual_seed(seed) + + +def max_memory_allocated(device: _CPUPlaceLike | None = None) -> int: + r""" + The API max_memory_allocated is not supported in CPU PaddlePaddle. + Please reinstall PaddlePaddle with GPU or XPU support to call this API. + """ + raise ValueError( + "The API paddle.device.max_memory_allocated is not supported in CPU PaddlePaddle. " + "Please reinstall PaddlePaddle with GPU or XPU support to call this API." + ) + + +def max_memory_reserved(device: _CPUPlaceLike | None = None) -> int: + r""" + The API max_memory_reserved is not supported in CPU PaddlePaddle. + Please reinstall PaddlePaddle with GPU or XPU support to call this API. + """ + raise ValueError( + "The API paddle.device.max_memory_reserved is not supported in CPU PaddlePaddle. " + "Please reinstall PaddlePaddle with GPU or XPU support to call this API." 
+ ) + + +def reset_max_memory_allocated(device: _CPUPlaceLike | None = None) -> None: + r""" + The API reset_max_memory_allocated is not supported in CPU PaddlePaddle. + Please reinstall PaddlePaddle with GPU or XPU support to call this API. + """ + raise ValueError( + "The API paddle.device.reset_max_memory_allocated is not supported in CPU PaddlePaddle. " + "Please reinstall PaddlePaddle with GPU or XPU support to call this API." + ) + + +def reset_max_memory_reserved(device: _CPUPlaceLike | None = None) -> None: + r""" + The API reset_max_memory_reserved is not supported in CPU PaddlePaddle. + Please reinstall PaddlePaddle with GPU or XPU support to call this API. + """ + raise ValueError( + "The API paddle.device.reset_max_memory_reserved is not supported in CPU PaddlePaddle. " + "Please reinstall PaddlePaddle with GPU or XPU support to call this API." + ) diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 3bc294527f21a..ceaf180451b19 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -776,3 +776,36 @@ def set_rng_state( core.default_cpu_generator().set_state(new_state) else: core.default_cuda_generator(place.get_device_id()).set_state(new_state) + + +def manual_seed(seed: int) -> None: + """Set the seed for generating random numbers for the current Device. + + .. warning:: + If you are working with a multi-Device model, this function is insufficient + to get determinism. To seed all Devices, use :func:`manual_seed_all`. + If current Device is CPU, this function will set the seed of the default CPU generator. + + Sets the seed for global default generator, which manages the random number generation. + + Args: + seed(int): The random seed to set. + + Returns: + None + + Examples: + .. 
code-block:: python + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> paddle.device.manual_seed(102) + >>> # paddle.cuda.manual_seed(102) is equivalent to paddle.device.manual_seed(102) + >>> paddle.cuda.manual_seed(102) + + """ + seed = int(seed) + place = paddle.framework._current_expected_place_() + if isinstance(place, core.CPUPlace): + core.default_cpu_generator().manual_seed(seed) + else: + core.default_cuda_generator(place.get_device_id()).manual_seed(seed) diff --git a/python/paddle/device/custom_device.py b/python/paddle/device/custom_device.py index 06b631f48cc1a..7075f60209582 100644 --- a/python/paddle/device/custom_device.py +++ b/python/paddle/device/custom_device.py @@ -570,3 +570,35 @@ def set_rng_state( core.default_cpu_generator().set_state(new_state) else: core.default_custom_device_generator(place).set_state(new_state) + + +def manual_seed(seed: int) -> None: + r"""Set the seed for generating random numbers for the current Device. + + .. warning:: + If you are working with a multi-Device model, this function is insufficient + to get determinism. To seed all Devices, use :func:`manual_seed_all`. + If current Device is CPU, this function will set the seed of the default CPU generator. + + Sets the seed for global default generator, which manages the random number generation. + + Args: + seed(int): The random seed to set. + + Returns: + None + + Examples: + .. 
code-block:: python + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> paddle.device.manual_seed(102) + >>> # paddle.cuda.manual_seed(102) is equivalent to paddle.device.manual_seed(102) + + """ + seed = int(seed) + place = paddle.framework._current_expected_place() + if isinstance(place, core.CPUPlace): + core.default_cpu_generator().manual_seed(seed) + else: + core.default_custom_device_generator(place).manual_seed(seed) diff --git a/python/paddle/device/xpu/__init__.py b/python/paddle/device/xpu/__init__.py index 8f585658a3472..15563a640aa97 100644 --- a/python/paddle/device/xpu/__init__.py +++ b/python/paddle/device/xpu/__init__.py @@ -579,3 +579,36 @@ def set_rng_state( core.default_cpu_generator().set_state(new_state) else: core.default_xpu_generator(place.get_device_id()).set_state(new_state) + + +def manual_seed(seed: int) -> None: + r"""Set the seed for generating random numbers for the current Device. + + .. warning:: + If you are working with a multi-Device model, this function is insufficient + to get determinism. To seed all Devices, use :func:`manual_seed_all`. + If current Device is CPU, this function will set the seed of the default CPU generator. + + Sets the seed for global default generator, which manages the random number generation. + + Args: + seed(int): The random seed to set. + + Returns: + None + + Examples: + .. 
code-block:: python + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> paddle.device.manual_seed(102) + >>> # paddle.cuda.manual_seed(102) is equivalent to paddle.device.manual_seed(102) + >>> paddle.cuda.manual_seed(102) + + """ + seed = int(seed) + place = paddle.framework._current_expected_place_() + if isinstance(place, core.CPUPlace): + core.default_cpu_generator().manual_seed(seed) + else: + core.default_xpu_generator(place.get_device_id()).manual_seed(seed) diff --git a/test/compat/test_device_apis.py b/test/compat/test_device_apis.py index 04a499aa3173e..894241564edf4 100644 --- a/test/compat/test_device_apis.py +++ b/test/compat/test_device_apis.py @@ -27,6 +27,51 @@ def is_custom_device(): return False +def only_has_cpu(): + return ( + not core.is_compiled_with_cuda() + and not core.is_compiled_with_xpu() + and not is_custom_device() + ) + + +class TestErrorCPU(unittest.TestCase): + def test_max_memory_allocated_raises_on_cpu(self): + if only_has_cpu(): + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.cuda.max_memory_allocated() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.device.max_memory_allocated() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.cuda.max_memory_reserved() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.device.max_memory_reserved() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.cuda.reset_max_memory_allocated() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.device.reset_max_memory_allocated() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + paddle.cuda.reset_max_memory_reserved() + with self.assertRaisesRegex( + ValueError, "not supported in CPU PaddlePaddle" + ): + 
paddle.device.reset_max_memory_reserved() + + class TestDeviceAPIs(unittest.TestCase): """Test paddle.device APIs across different hardware types.""" @@ -164,6 +209,23 @@ def test_memory_apis_cuda(self): self.assertIsInstance(mem7, int) self.assertGreaterEqual(mem7, 0) + # Test max_memory_allocated with different input types + mem1 = paddle.cuda.max_memory_allocated() + self.assertIsInstance(mem1, int) + self.assertGreaterEqual(mem1, 0) + + mem2 = paddle.cuda.max_memory_allocated('gpu:0') + self.assertIsInstance(mem2, int) + self.assertGreaterEqual(mem2, 0) + + mem3 = paddle.cuda.max_memory_allocated(0) + self.assertIsInstance(mem3, int) + self.assertGreaterEqual(mem3, 0) + + mem7 = paddle.cuda.max_memory_allocated(paddle.CUDAPlace(0)) + self.assertIsInstance(mem7, int) + self.assertGreaterEqual(mem7, 0) + # Test max_memory_reserved with different input types mem4 = paddle.device.max_memory_reserved() self.assertIsInstance(mem4, int) @@ -173,6 +235,14 @@ def test_memory_apis_cuda(self): self.assertIsInstance(mem8, int) self.assertGreaterEqual(mem8, 0) + mem4 = paddle.cuda.max_memory_reserved() + self.assertIsInstance(mem4, int) + self.assertGreaterEqual(mem4, 0) + + mem8 = paddle.cuda.max_memory_reserved('gpu:0') + self.assertIsInstance(mem8, int) + self.assertGreaterEqual(mem8, 0) + mem9 = paddle.device.max_memory_reserved(0) self.assertIsInstance(mem9, int) self.assertGreaterEqual(mem9, 0) @@ -508,11 +578,38 @@ def test_reset_memory_apis_cuda(self): paddle.device.reset_max_memory_allocated(0) paddle.device.reset_max_memory_allocated(paddle.CUDAPlace(0)) + # Test reset functions with different input types + paddle.device.reset_peak_memory_stats() + paddle.device.reset_peak_memory_stats('gpu:0') + paddle.device.reset_peak_memory_stats('cuda:0') + paddle.device.reset_peak_memory_stats(0) + paddle.device.reset_peak_memory_stats(paddle.CUDAPlace(0)) + + # Test reset functions with different input types + paddle.cuda.reset_peak_memory_stats() + 
paddle.cuda.reset_peak_memory_stats('gpu:0') + paddle.cuda.reset_peak_memory_stats(0) + paddle.cuda.reset_peak_memory_stats(paddle.CUDAPlace(0)) + paddle.device.reset_max_memory_reserved() paddle.device.reset_max_memory_reserved('gpu:0') + paddle.device.reset_max_memory_reserved('cuda:0') paddle.device.reset_max_memory_reserved(0) paddle.device.reset_max_memory_reserved(paddle.CUDAPlace(0)) + # Test reset functions with different input types + paddle.cuda.reset_max_memory_allocated() + paddle.cuda.reset_max_memory_allocated('gpu:0') + paddle.cuda.reset_max_memory_allocated('cuda:0') + paddle.cuda.reset_max_memory_allocated(0) + paddle.cuda.reset_max_memory_allocated(paddle.CUDAPlace(0)) + + paddle.cuda.reset_max_memory_reserved() + paddle.cuda.reset_max_memory_reserved('gpu:0') + paddle.cuda.reset_max_memory_reserved('cuda:0') + paddle.cuda.reset_max_memory_reserved(0) + paddle.cuda.reset_max_memory_reserved(paddle.CUDAPlace(0)) + # Check that max memory has been reset max_allocated_after_reset = paddle.device.max_memory_allocated() max_reserved_after_reset = paddle.device.max_memory_reserved() diff --git a/test/compat/test_event_stream_apis.py b/test/compat/test_event_stream_apis.py index 311bac55b7a1e..926f74fc0ba38 100644 --- a/test/compat/test_event_stream_apis.py +++ b/test/compat/test_event_stream_apis.py @@ -116,6 +116,9 @@ def _test_event_stream_apis_impl(self, device_str): prev_stream = paddle.device.set_stream(stream1) self.assertIsInstance(prev_stream, paddle.device.Stream) + prev_stream = paddle.cuda.set_stream(stream1) + self.assertIsInstance(prev_stream, paddle.cuda.Stream) + # Test Event.record() with default stream event1.record() # Query result may be True immediately for some devices @@ -350,5 +353,54 @@ def test_event_stream_timing_functionality(self): self.assertGreater(elapsed_time, 0) # Should take some time +class TestEventAPIs(unittest.TestCase): + """Unified test for paddle.Event, paddle.device.Event, and paddle.cuda.Event.""" + + def 
setUp(self): + if not paddle.device.is_compiled_with_cuda(): + self.skipTest("This test requires CUDA.") + self.device = "gpu:0" + paddle.device.set_device(self.device) + + self.event_classes = [ + ("paddle.Event", paddle.Event), + ("paddle.cuda.Event", paddle.cuda.Event), + ] + + def test_event_timing_consistency(self): + """Check timing consistency across different Event APIs.""" + for name, EventCls in self.event_classes: + with self.subTest(api=name): + start = EventCls(enable_timing=True) + end = EventCls(enable_timing=True) + + start.record() + + x = paddle.randn([2048, 2048], dtype="float32") + y = paddle.randn([2048, 2048], dtype="float32") + z = paddle.matmul(x, y) + _ = z.mean() + + end.record() + end.synchronize() + + elapsed = start.elapsed_time(end) + self.assertIsInstance(elapsed, (int, float)) + self.assertGreater( + elapsed, + 0.0, + f"{name} should measure positive elapsed time.", + ) + + def test_event_methods_available(self): + """Ensure all Event variants expose expected methods.""" + for name, EventCls in self.event_classes: + with self.subTest(api=name): + e = EventCls(enable_timing=True) + self.assertTrue(hasattr(e, "record")) + self.assertTrue(hasattr(e, "synchronize")) + self.assertTrue(hasattr(e, "elapsed_time")) + + if __name__ == '__main__': unittest.main() diff --git a/test/compat/test_paddle_cuda_apis.py b/test/compat/test_paddle_cuda_apis.py index 7c35079390373..4531a92498023 100644 --- a/test/compat/test_paddle_cuda_apis.py +++ b/test/compat/test_paddle_cuda_apis.py @@ -464,5 +464,58 @@ def test_set_device_invalid_param(self): self.assertIn("Unsupported device type", str(context.exception)) +class TestBf16Supported(unittest.TestCase): + def test_is_bf16_supported(self): + self.assertIsInstance(paddle.cuda.is_bf16_supported(), bool) + self.assertIsInstance(paddle.device.is_bf16_supported(), bool) + self.assertIsInstance(paddle.device.is_bf16_supported(True), bool) + self.assertIsInstance(paddle.cuda.is_bf16_supported(False), bool) + 
if should_skip_tests(): + self.assertFalse(paddle.cuda.is_bf16_supported()) + self.assertFalse(paddle.device.is_bf16_supported()) + + +class TestManualSeed(unittest.TestCase): + def test_device_manual_seed(self): + paddle.device.manual_seed(102) + x1 = paddle.randn([2, 3]) + + paddle.device.manual_seed(999) + x2 = paddle.randn([2, 3]) + + paddle.device.manual_seed(102) + x3 = paddle.randn([2, 3]) + + self.assertTrue( + paddle.equal_all(x1, x3), + "Random outputs should be identical with the same seed", + ) + + self.assertFalse( + paddle.equal_all(x1, x2), + "Random outputs should differ with different seeds", + ) + + def test_cuda_manual_seed(self): + paddle.cuda.manual_seed(102) + x1 = paddle.randn([2, 3], dtype='float32') + + paddle.cuda.manual_seed(999) + x2 = paddle.randn([2, 3], dtype='float32') + + paddle.cuda.manual_seed(102) + x3 = paddle.randn([2, 3], dtype='float32') + + self.assertTrue( + paddle.equal_all(x1, x3), + "Random outputs should be identical with the same seed", + ) + + self.assertFalse( + paddle.equal_all(x1, x2), + "Random outputs should differ with different seeds", + ) + + if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_cuda_unittest.py b/test/legacy_test/test_cuda_unittest.py index 9e82878fe1f14..c0c4bce76ccd0 100644 --- a/test/legacy_test/test_cuda_unittest.py +++ b/test/legacy_test/test_cuda_unittest.py @@ -151,6 +151,22 @@ def test_get_default_device(self): if paddle.is_compiled_with_cuda(): self.assertEqual(paddle.get_default_device(), paddle.device('cuda')) + def test_get_device(self): + x_cpu = paddle.to_tensor([1, 2, 3], place=paddle.CPUPlace()) + self.assertEqual(paddle.get_device(x_cpu), -1) + if paddle.device.is_compiled_with_cuda(): + x_gpu = paddle.to_tensor([1, 2, 3], place=paddle.CUDAPlace(0)) + self.assertEqual(paddle.get_device(x_gpu), 0) + + def test_set_default_device(self): + if paddle.is_compiled_with_cuda(): + paddle.set_default_device("gpu") + self.assertEqual(paddle.get_default_device(), 
paddle.device('cuda')) + + if paddle.is_compiled_with_xpu(): + paddle.set_default_device("xpu") + self.assertEqual(paddle.get_default_device(), paddle.device('xpu')) + @unittest.skipIf( ( not paddle.device.is_compiled_with_cuda() @@ -347,5 +363,21 @@ def test_get_stream_from_external(self): ) +class TestDeviceDevice(unittest.TestCase): + def test_device_device(self): + current = paddle.device.get_device() + with paddle.device.device("cpu"): + self.assertEqual(paddle.device.get_device(), 'cpu') + self.assertEqual(paddle.device.get_device(), current) + +class TestCudaDevice(unittest.TestCase): + def test_device_device(self): + current = paddle.device.get_device() + with paddle.cuda.device("cpu"): + self.assertEqual(paddle.device.get_device(), 'cpu') + self.assertEqual(paddle.device.get_device(), current) + + if __name__ == '__main__': unittest.main()