diff --git a/engineV2-README.md b/engineV2-README.md index 5ce90f76..df2ab557 100644 --- a/engineV2-README.md +++ b/engineV2-README.md @@ -86,6 +86,8 @@ | `--timeout` | int | 单个测试用例执行超时秒数(默认 1800) | | `--show_runtime_status` | bool | 是否实时显示当前的测试进度(默认 True) | | `--random_seed` | int | numpy random的随机种子(默认为0,此时不会显式设置numpy random的seed) | +| `--custom_device_vs_gpu` | bool | 启用自定义设备与GPU的精度对比测试模式(默认 False) | +| `--custom_device_vs_gpu_mode` | str | 自定义设备与GPU对比的模式:`upload` 或 `download`(默认 `upload`) | | `--bitwise_alignment` | bool | 是否进行诸位对齐对比,开启后所有的api的精度对比都按照atol=0.0,rtol = 0.0的精度对比结果| @@ -123,6 +125,67 @@ python engineV2.py --accuracy=True --api_config_file="tester/api_config/api_conf ``` 该脚本使用参数:`NUM_GPUS=-1, NUM_WORKERS_PER_GPU=-1, GPU_IDS="4,5,6,7"`,在后台运行程序,可在修改 `run.sh` 参数后使用 +### 自定义设备与 GPU 精度对比测试 + +#### 功能说明 + +`APITestPaddleDeviceVSGPU` 类支持跨设备的精度对比测试,目前主要面向 **GPU 上传 + XPU(或其他设备)下载对比** 这一典型场景。该功能分为两个模式: + +- **Upload 模式(GPU 侧)**:在 GPU 上执行测试,保存结果到本地,然后上传到 BOS 云存储 +- **Download 模式(XPU/其他设备侧)**:在 XPU 或其他设备上执行测试,从 BOS 下载 GPU 侧的参考数据进行精度对比 + +#### 工作流程 + +1. **Upload 模式工作流(GPU 侧)**: + - 在 GPU 设备上执行 Paddle API 测试 + - 保存 Forward 输出和 Backward 梯度到本地 PDTensor 文件 + - 文件名依赖随机种子与配置哈希(如 `1210-xxx.pdtensor`) + - 使用 bcecmd 工具将文件上传到 BOS 云存储 + +2. 
**Download 模式工作流(XPU/其他设备侧)**: + - 在 XPU 或其他设备上执行相同的 Paddle API 测试 + - 使用与 GPU 侧上传时一致的随机种子和配置,构造同名 PDTensor 文件名 + - 从 BOS 云存储下载对应的 GPU 参考数据 + - 对比 Forward 输出和 Backward 梯度,验证与 GPU 的精度一致性 + +#### 配置文件设置 + +首先,编辑 `tester/bos_config.yaml` 配置文件: + +```yaml +# BOS 配置文件 +# 用于自定义设备与 GPU 精度对比测试的云存储配置 + +# BOS 存储路径(如:xly-devops/liujingzong/) +bos_path: "xly-devops/liujingzong/" + +# BOS 配置文件路径(bcecmd 使用的配置文件路径) +bos_conf_path: "./conf" + +# bcecmd 命令行工具路径 +bcecmd_path: "./bcecmd" +``` + +#### 命令示例 +**在 GPU 上执行测试并上传结果** +```bash +# 在 GPU 设备上执行,生成1210-xxx.pdtensor 文件并上传到 BOS +python engineV2.py --custom_device_vs_gpu=True \ + --custom_device_vs_gpu_mode=upload \ + --random_seed=1210 \ + --api_config_file="./test1.txt" \ + --gpu_ids=7 +``` + +**在 XPU 上下载 GPU 的参考数据并进行精度对比** +```bash +python engineV2.py --custom_device_vs_gpu=True \ + --custom_device_vs_gpu_mode=download \ + --random_seed=1210 \ + --api_config_file="./test1.txt" \ + --gpu_ids=7 +``` + ## 监控方法 执行 `run.sh` 后可通过以下方式监控: diff --git a/engineV2.py b/engineV2.py index 96c14e6a..8e901239 100644 --- a/engineV2.py +++ b/engineV2.py @@ -13,10 +13,12 @@ from concurrent.futures import TimeoutError, as_completed from datetime import datetime from multiprocessing import Lock, Manager, cpu_count, set_start_method +from pathlib import Path from typing import TYPE_CHECKING import numpy as np import pynvml +import yaml from pebble import ProcessExpired, ProcessPool if TYPE_CHECKING: @@ -28,6 +30,7 @@ APITestAccuracyStable, APITestCINNVSDygraph, APITestCustomDeviceVSCPU, + APITestPaddleDeviceVSGPU, APITestPaddleGPUPerformance, APITestPaddleOnly, APITestPaddleTorchGPUPerformance, @@ -39,7 +42,18 @@ os.environ["FLAGS_USE_SYSTEM_ALLOCATOR"] = "1" os.environ["NVIDIA_TF32_OVERRIDE"] = "0" -VALID_TEST_ARGS = {"test_amp", "test_backward", "atol", "rtol", "test_tol"} +VALID_TEST_ARGS = { + "test_amp", + "test_backward", + "atol", + "rtol", + "test_tol", + "operation_mode", + "bos_path", + "random_seed", + "bos_conf_path", + "bcecmd_path", +} 
DEVICE_TYPE = None DEVICE_TYPE_DETECTED = False @@ -123,7 +137,7 @@ def detect_device_type() -> str: try: out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT) if any(re.match(r"^\|\s*\d+\s+Iluvatar", line) for line in out.splitlines()): - DEVICE_TYPE = "iluvatar" + DEVICE_TYPE = "iluvatar_gpu" DEVICE_TYPE_DETECTED = True return DEVICE_TYPE except Exception: @@ -164,7 +178,7 @@ def get_device_count() -> int: DEVICE_COUNT = len(ids) return DEVICE_COUNT - if device_type == "iluvatar": + if device_type == "iluvatar_gpu": out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT) ids = set() for line in out.splitlines(): @@ -202,7 +216,7 @@ def _refresh_snapshot(device_type): snapshot[dev_id] = (total_mib / 1024.0, used_mib / 1024.0) break - elif device_type == "iluvatar": + elif device_type == "iluvatar_gpu": out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT) lines = out.splitlines() for i, line in enumerate(lines): @@ -240,7 +254,7 @@ def get_memory_info(gpu_id): finally: pynvml.nvmlShutdown() - if device_type in ("xpu", "iluvatar"): + if device_type in ("xpu", "iluvatar_gpu"): _refresh_snapshot(device_type) if _MEM_SNAPSHOT is None or gpu_id not in _MEM_SNAPSHOT: raise RuntimeError(f"Failed to get memory info for {device_type} device {gpu_id}") @@ -379,6 +393,7 @@ def pid_exists(pid): APITestAccuracyStable, APITestCINNVSDygraph, APITestCustomDeviceVSCPU, + APITestPaddleDeviceVSGPU, APITestPaddleGPUPerformance, APITestPaddleOnly, APITestPaddleTorchGPUPerformance, @@ -395,6 +410,7 @@ def pid_exists(pid): "APITestPaddleTorchGPUPerformance": APITestPaddleTorchGPUPerformance, "APITestAccuracyStable": APITestAccuracyStable, "APITestCustomDeviceVSCPU": APITestCustomDeviceVSCPU, + "APITestPaddleDeviceVSGPU": APITestPaddleDeviceVSGPU, } globals().update(test_classes) @@ -463,7 +479,9 @@ def run_test_case(api_config_str, options): "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance, 
"accuracy_stable": APITestAccuracyStable, "paddle_custom_device": APITestCustomDeviceVSCPU, + "custom_device_vs_gpu": APITestPaddleDeviceVSGPU, } + test_class = next( (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)), APITestAccuracy, # default fallback @@ -643,6 +661,19 @@ def main(): default=0, help="The numpy random seed ", ) + parser.add_argument( + "--custom_device_vs_gpu", + type=parse_bool, + default=False, + help="test paddle api on custom device vs GPU", + ) + parser.add_argument( + "--custom_device_vs_gpu_mode", + type=str, + choices=["upload", "download"], + default="upload", + help="operation mode for custom_device_vs_gpu: 'upload' or 'download'", + ) parser.add_argument( "--bitwise_alignment", type=bool, @@ -664,6 +695,7 @@ def main(): options.paddle_torch_gpu_performance, options.accuracy_stable, options.paddle_custom_device, + options.custom_device_vs_gpu, ] if len([m for m in mode if m is True]) != 1: print( @@ -676,10 +708,45 @@ def main(): "--paddle_torch_gpu_performance" "--accuracy_stable" "--paddle_custom_device" - " to True.", + "--custom_device_vs_gpu", flush=True, ) return + + # 处理 custom_device_vs_gpu 模式的配置 + bos_config_data = None + if options.custom_device_vs_gpu: + # 读取 BOS 配置文件(固定路径:tester/bos_config.yaml) + bos_config_path = Path("tester/bos_config.yaml") + if not bos_config_path.exists(): + print(f"BOS config file not found: {bos_config_path}", flush=True) + return + + try: + with open(bos_config_path, encoding="utf-8") as f: + bos_config_data = yaml.safe_load(f) + + if not bos_config_data: + print(f"BOS config file is empty: {bos_config_path}", flush=True) + return + + # 验证必需的配置项 + required_keys = ["bos_path", "bos_conf_path", "bcecmd_path"] + missing_keys = [key for key in required_keys if key not in bos_config_data] + if missing_keys: + print(f"Missing required keys in BOS config: {missing_keys}", flush=True) + return + + # 将配置添加到 options 中,以便传递给测试类 + options.operation_mode = 
options.custom_device_vs_gpu_mode + options.bos_path = bos_config_data["bos_path"] + options.bos_conf_path = bos_config_data["bos_conf_path"] + options.bcecmd_path = bos_config_data["bcecmd_path"] + + except Exception as e: + print(f"Failed to load BOS config file {bos_config_path}: {e}", flush=True) + return + if options.test_tol and not options.accuracy: print("--test_tol takes effect when --accuracy is True.", flush=True) if options.test_backward and not options.paddle_cinn: @@ -698,6 +765,8 @@ def main(): APITestAccuracy, APITestAccuracyStable, APITestCINNVSDygraph, + APITestCustomDeviceVSCPU, + APITestPaddleDeviceVSGPU, APITestPaddleGPUPerformance, APITestPaddleOnly, APITestPaddleTorchGPUPerformance, @@ -724,13 +793,27 @@ def main(): "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance, "accuracy_stable": APITestAccuracyStable, "paddle_custom_device": APITestCustomDeviceVSCPU, + "custom_device_vs_gpu": APITestPaddleDeviceVSGPU, } + test_class = next( (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)), APITestAccuracy, # default fallback ) - if options.accuracy: + if options.custom_device_vs_gpu: + # custom_device_vs_gpu 模式需要传递额外参数 + case = test_class( + api_config, + operation_mode=options.operation_mode, + bos_path=options.bos_path, + bos_conf_path=options.bos_conf_path, + bcecmd_path=options.bcecmd_path, + random_seed=options.random_seed, + atol=options.atol, + rtol=options.rtol, + ) + elif options.accuracy: case = test_class( api_config, test_amp=options.test_amp, diff --git a/tester/__init__.py b/tester/__init__.py index 73e0969e..e721c402 100644 --- a/tester/__init__.py +++ b/tester/__init__.py @@ -10,6 +10,7 @@ "APITestBase", "APITestCINNVSDygraph", "APITestCustomDeviceVSCPU", + "APITestPaddleDeviceVSGPU", "APITestPaddleGPUPerformance", "APITestPaddleOnly", "APITestPaddleTorchGPUPerformance", @@ -38,6 +39,7 @@ from .base import APITestBase from .paddle_cinn_vs_dygraph import APITestCINNVSDygraph from 
.paddle_device_vs_cpu import APITestCustomDeviceVSCPU + from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU from .paddle_gpu_performance import APITestPaddleGPUPerformance from .paddle_only import APITestPaddleOnly from .paddle_torch_gpu_performance import APITestPaddleTorchGPUPerformance @@ -84,6 +86,10 @@ def __getattr__(name: str) -> Any: from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU return APITestCustomDeviceVSCPU + elif name == "APITestPaddleDeviceVSGPU": + from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU + + return APITestPaddleDeviceVSGPU elif name == "paddle_to_torch": from . import paddle_to_torch diff --git a/tester/bos_config.yaml b/tester/bos_config.yaml new file mode 100644 index 00000000..a981d9c9 --- /dev/null +++ b/tester/bos_config.yaml @@ -0,0 +1,12 @@ +# BOS 配置文件 +# 用于自定义设备与 GPU 精度对比测试的云存储配置 + +# BOS 存储路径(如:xly-devops/liujingzong/) +bos_path: "xly-devops/liujingzong/" + +# BOS 配置文件路径(bcecmd 使用的配置文件路径) +bos_conf_path: "./conf" + +# bcecmd 命令行工具路径 +bcecmd_path: "./bcecmd" + diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py new file mode 100644 index 00000000..3694e163 --- /dev/null +++ b/tester/paddle_device_vs_gpu.py @@ -0,0 +1,401 @@ +from __future__ import annotations + +import hashlib +import json +import subprocess +import tempfile +from pathlib import Path + +import numpy as np +import paddle + +from .api_config.log_writer import write_to_log +from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU + + +class APITestPaddleDeviceVSGPU(APITestCustomDeviceVSCPU): + def __init__(self, api_config, **kwargs): + # 继承 CustomDevice vs CPU 的基本功能 + super().__init__(api_config, **kwargs) + + # 新增参数 + self.operation_mode = kwargs.get("operation_mode", None) + self.bos_path = kwargs.get("bos_path", "") + self.random_seed = kwargs.get("random_seed", 0) + self.atol = kwargs.get("atol", 1e-2) + self.rtol = kwargs.get("rtol", 1e-2) + self.bcecmd_path = Path(kwargs.get("bcecmd_path", 
"./bcecmd")).resolve()
+        self.bos_conf_path = kwargs.get("bos_conf_path", "./conf")
+
+        # 设置随机种子确保一致性
+        if self.random_seed != 0:
+            np.random.seed(self.random_seed)
+            paddle.seed(self.random_seed)
+
+    def _get_config_hash(self):
+        """生成API配置的哈希值,用于文件名"""
+        config_str = json.dumps(
+            {
+                "api_name": self.api_config.api_name,
+                "args": [str(arg) for arg in self.api_config.args],
+                "kwargs": {k: str(v) for k, v in self.api_config.kwargs.items()},
+            },
+            sort_keys=True,
+        )
+        return hashlib.md5(config_str.encode()).hexdigest()[:16]
+
+    def _get_local_device_type(self):
+        """获取当前设备的类型,优先复用 engineV2 的检测逻辑。"""
+        from engineV2 import detect_device_type
+
+        return detect_device_type()
+
+    def _get_filename(self):
+        """生成PDTensor文件名(不再包含设备前缀,只依赖随机种子和配置哈希)"""
+        return f"{self.random_seed}-{self._get_config_hash()}.pdtensor"
+
+    def _save_tensor_locally(self, output, grads=None):
+        """保存结果到本地PDTensor文件"""
+        # 保存到临时文件
+        temp_dir = tempfile.gettempdir()
+        filename = self._get_filename()
+        local_path = Path(temp_dir) / filename
+
+        # 使用paddle.save保存张量数据
+        save_data = {"output": output}
+        if grads is not None:
+            save_data["grads"] = grads
+
+        paddle.save(save_data, str(local_path))
+        print(f"[upload] Saved pdtensor file: {local_path}", flush=True)
+        return local_path
+
+    def _build_bos_path(self, filename: str) -> str:
+        cleaned = self.bos_path.strip().lstrip("/").rstrip("/")
+        return f"bos:/{cleaned}/{filename}"
+
+    def _bcecmd_cp(self, src: str, dst: str, action: str):
+        """使用指定的 bcecmd 命令执行 cp 操作"""
+        cmd = [
+            str(self.bcecmd_path),
+            "--conf-path",
+            self.bos_conf_path,
+            "bos",
+            "cp",
+            src,
+            dst,
+        ]
+        print(f"[{action}] Running command: {' '.join(cmd)}", flush=True)
+        return subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+
+    def _upload_to_bos(self, local_path):
+        """使用 bcecmd 上传文件到 BOS"""
+        if not self.bos_path:
+            print(f"[upload] No bos_path specified, skip upload", flush=True)
+            return
+
+        remote_path = self._build_bos_path(local_path.name)
+        try:
+            
result = self._bcecmd_cp(str(local_path), remote_path, "upload") + if result.returncode == 0: + print(f"[upload] Upload succeeded: {remote_path}", flush=True) + local_path.unlink(missing_ok=True) + else: + print( + f"[upload] Upload failed: {remote_path}, stderr: {result.stderr}", + flush=True, + ) + except Exception as e: + print(f"[upload] Upload failed: {e}", flush=True) + + def _download_from_bos(self, filename): + """使用 bcecmd 从 BOS 下载文件""" + if not self.bos_path: + print(f"[download] No bos_path specified, skip download", flush=True) + return None + + temp_dir = tempfile.gettempdir() + local_path = Path(temp_dir) / filename + + if local_path.exists(): + print(f"[download] File already exists locally: {local_path}", flush=True) + return local_path + + remote_path = self._build_bos_path(filename) + try: + result = self._bcecmd_cp(remote_path, str(local_path), "download") + if result.returncode == 0: + print(f"[download] Download succeeded: {local_path}", flush=True) + return local_path + else: + print( + f"[download] Download failed: {remote_path}, stderr: {result.stderr}", + flush=True, + ) + return None + except Exception as e: + print(f"[download] Download failed: {e}", flush=True) + return None + + def _run_paddle(self, device_type: str): + """在指定设备上运行 Paddle(统一 GPU / XPU / 自定义设备逻辑)。""" + try: + paddle_device_type = device_type + if device_type == "gpu": + # engineV2.py sets CUDA_VISIBLE_DEVICES, so paddle will use the correct GPU. 
+ paddle.set_device("gpu") + elif device_type == "xpu": + paddle.set_device(f"xpu:{self.xpu_device_id}") + elif device_type == self.custom_device_type and self.check_custom_device_available(): + paddle.set_device(f"{self.custom_device_type}:{self.custom_device_id}") + elif device_type == "cpu": + paddle.set_device("cpu") + else: + print(f"[error] No custom device available", flush=True) + return None, None + + if not self.ana_paddle_api_info(): + print("ana_paddle_api_info failed", flush=True) + return None, None + + if not self.gen_numpy_input(): + print("gen_numpy_input failed", flush=True) + return None, None + + if not self.gen_paddle_input(): + print("gen_paddle_input failed", flush=True) + return None, None + + paddle_output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs) + + paddle_grads = None + if self.need_check_grad(): + inputs_list = self.get_paddle_input_list() + result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad( + paddle_output + ) + if inputs_list and result_outputs and result_outputs_grads: + paddle_grads = paddle.grad( + outputs=result_outputs, + inputs=inputs_list, + grad_outputs=result_outputs_grads, + allow_unused=True, + ) + + return paddle_output, paddle_grads + + except Exception as e: + print( + f"[paddle {paddle_device_type} error] {self.api_config.config}: {e}", + flush=True, + ) + write_to_log("paddle_error", self.api_config.config) + return None, None + + def _compare_with_downloaded(self, local_output, local_grads, downloaded_tensor): + """与下载的结果进行对比""" + try: + print(f"[compare] Comparing results for {self.api_config.config}", flush=True) + + # 加载下载的数据 + remote_data = paddle.load(str(downloaded_tensor)) + remote_output = remote_data["output"] + + # 对比Forward输出(直接使用Paddle对比) + try: + if isinstance(local_output, paddle.Tensor) and isinstance( + remote_output, paddle.Tensor + ): + # 使用Paddle的对比方法 + np.testing.assert_allclose( + local_output.numpy(), + remote_output.numpy(), + atol=self.atol, + 
rtol=self.rtol, + equal_nan=True, + ) + elif isinstance(local_output, (list, tuple)) and isinstance( + remote_output, (list, tuple) + ): + # 列表或元组对比 + for i, (local_item, remote_item) in enumerate(zip(local_output, remote_output)): + if isinstance(local_item, paddle.Tensor) and isinstance( + remote_item, paddle.Tensor + ): + np.testing.assert_allclose( + local_item.numpy(), + remote_item.numpy(), + atol=self.atol, + rtol=self.rtol, + equal_nan=True, + ) + print( + f"[compare] Forward output[{i}] comparison passed", + flush=True, + ) + else: + # 其他情况,尝试转换为numpy对比 + local_np = ( + local_output.numpy() + if isinstance(local_output, paddle.Tensor) + else np.array(local_output) + ) + remote_np = ( + remote_output.numpy() + if isinstance(remote_output, paddle.Tensor) + else np.array(remote_output) + ) + np.testing.assert_allclose( + local_np, + remote_np, + atol=self.atol, + rtol=self.rtol, + equal_nan=True, + ) + + print( + f"[compare] Forward accuracy check passed for {self.api_config.config}", + flush=True, + ) + except Exception as e: + print( + f"[compare] Forward accuracy check failed for {self.api_config.config}, error: {e}", + flush=True, + ) + write_to_log("accuracy_error", self.api_config.config) + return False + + # 对比Backward梯度(如果存在且Forward通过) + if local_grads is not None and "grads" in remote_data: + remote_grads = remote_data["grads"] + + try: + if isinstance(local_grads, (list, tuple)) and isinstance( + remote_grads, (list, tuple) + ): + for i, (local_grad, remote_grad) in enumerate( + zip(local_grads, remote_grads) + ): + if isinstance(local_grad, paddle.Tensor) and isinstance( + remote_grad, paddle.Tensor + ): + np.testing.assert_allclose( + local_grad.numpy(), + remote_grad.numpy(), + atol=self.atol, + rtol=self.rtol, + equal_nan=True, + ) + print( + f"[compare] Backward gradient[{i}] comparison passed", + flush=True, + ) + elif isinstance(local_grads, paddle.Tensor) and isinstance( + remote_grads, paddle.Tensor + ): + np.testing.assert_allclose( + 
local_grads.numpy(),
+                            remote_grads.numpy(),
+                            atol=self.atol,
+                            rtol=self.rtol,
+                            equal_nan=True,
+                        )
+
+                    print(
+                        f"[compare] Backward gradient check passed for {self.api_config.config}",
+                        flush=True,
+                    )
+                except Exception as e:
+                    print(
+                        f"[compare] Backward gradient check failed for {self.api_config.config}, error: {e}",
+                        flush=True,
+                    )
+                    return False
+
+            print(
+                f"[compare] Accuracy check passed for {self.api_config.config}",
+                flush=True,
+            )
+            write_to_log("pass", self.api_config.config)
+            return True
+
+        except Exception as e:
+            print(
+                f"[compare] Comparison failed for {self.api_config.config}, error: {e}",
+                flush=True,
+            )
+            write_to_log("accuracy_error", self.api_config.config)
+            return False
+
+    def test(self):
+        """Main test function"""
+        if self.operation_mode == "upload":
+            self._test_upload_mode()
+        elif self.operation_mode == "download":
+            self._test_download_mode()
+        else:
+            print(
+                "[error] custom_device_vs_gpu_mode 不能为空,请指定 --custom_device_vs_gpu_mode=upload 或 download",
+                flush=True,
+            )
+            return
+
+    def _test_upload_mode(self):
+        """Upload模式:执行测试并上传结果"""
+        print(f"[upload] Starting upload mode for {self.api_config.config}", flush=True)
+
+        local_device_type = self._get_local_device_type()
+        output, grads = self._run_paddle(local_device_type)
+
+        if output is None:
+            print(f"[upload] Execution failed for {self.api_config.config}", flush=True)
+            return
+
+        # 保存结果到本地PDTensor
+        local_path = self._save_tensor_locally(output, grads)
+
+        # 异步上传到BOS
+        self._upload_to_bos(local_path)
+
+        print(f"[upload] Upload mode completed for {self.api_config.config}", flush=True)
+
+    def _test_download_mode(self):
+        """Download模式:下载对比数据并验证"""
+        print(
+            f"[download] Starting download mode for {self.api_config.config}",
+            flush=True,
+        )
+
+        # 确定要下载的文件名(与 GPU 上传时保持一致)
+        target_filename = self._get_filename()
+
+        # 下载文件
+        downloaded_file = self._download_from_bos(target_filename)
+        if downloaded_file is None:
+            print(
+                f"[download] Failed to download comparison data for 
{self.api_config.config}", + flush=True, + ) + return + + # 在本地设备上执行测试 + local_device_type = self._get_local_device_type() + local_output, local_grads = self._run_paddle(local_device_type) + + if local_output is None: + print( + f"[download] Local execution failed for {self.api_config.config}", + flush=True, + ) + return + + # 与下载的结果进行对比 + success = self._compare_with_downloaded(local_output, local_grads, downloaded_file) + + # 清理下载的文件 + downloaded_file.unlink(missing_ok=True) + + print( + f"[download] Download mode completed for {self.api_config.config}", + flush=True, + )