From 40de954d7384d180da2b9648d29e1dd4d774acca Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Wed, 3 Dec 2025 16:17:15 +0800
Subject: [PATCH 01/13] first step

---
 tester/gpu_custom_dump.py | 286 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 286 insertions(+)
 create mode 100644 tester/gpu_custom_dump.py

diff --git a/tester/gpu_custom_dump.py b/tester/gpu_custom_dump.py
new file mode 100644
index 00000000..ec575962
--- /dev/null
+++ b/tester/gpu_custom_dump.py
@@ -0,0 +1,286 @@
+import argparse
+import os
+from datetime import datetime
+
+import paddle
+
+from . import APIConfig
+from .base import APITestBase
+
+
+class APITestGPUCustomDump(APITestBase):
+    """
+    在 GPU 与自定义设备（如 XPU / 第三方定制卡）上运行同一 API case，
+    计算前向 + 反向结果，并将结果以 npz 形式落盘。
+    """
+
+    def __init__(
+        self,
+        api_config,
+        dump_dir="report/gpu_custom_dump",
+        test_amp=False,
+        gpu_id=0,
+        custom_device_type=None,
+        custom_device_id=0,
+    ):
+        super().__init__(api_config)
+        self.dump_dir = dump_dir
+        self.test_amp = test_amp
+        self.gpu_id = gpu_id
+        self.custom_device_type = custom_device_type
+        self.custom_device_id = custom_device_id
+
+    # -------------------- 设备与落盘相关工具函数 --------------------
+    def _ensure_dirs(self, path):
+        os.makedirs(path, exist_ok=True)
+
+    def _to_tensor_list(self, x):
+        """将输出 / 梯度统一转换成 Tensor 列表，便于直接序列化保存。"""
+        if x is None:
+            return None
+        if isinstance(x, paddle.Tensor):
+            return [x]
+        if isinstance(x, (list, tuple)):
+            tensors = [t for t in x if isinstance(t, paddle.Tensor)]
+            return tensors or None
+        return None
+
+    def _dump_results(self, tag, output, grads):
+        """
+        将指定设备的前向 / 反向结果直接保存为 Tensor 列表（使用 paddle.save）：
+          <dump_dir>/<sanitized_api_name>/{tag}_forward.pdtensor
+          <dump_dir>/<sanitized_api_name>/{tag}_grad.pdtensor
+        """
+        api_name = self.api_config.config.replace("/", "_").replace(" ", "_")
+        dump_path = os.path.join(self.dump_dir, api_name)
+        self._ensure_dirs(dump_path)
+
+        out_list = self._to_tensor_list(output)
+        grad_list = self._to_tensor_list(grads)
+
+        if out_list is not None:
+            paddle.save(out_list, os.path.join(dump_path, f"{tag}_forward.pdtensor"))
+        if grad_list is not None:
+            paddle.save(grad_list, os.path.join(dump_path, f"{tag}_grad.pdtensor"))
+
+    def _run_on_device(self, device_str):
+        """
+        在指定设备上运行一次前向 + 反向，返回 (output, grads)。
+        device_str 形如：'gpu:0', 'xpu:0', 'iluvatar_gpu:0' 等。
+        """
+        import paddle
+
+        try:
+            paddle.set_device(device_str)
+        except Exception as e:
+            print(f"[device set error] {device_str} -> {e}", flush=True)
+            return None, None
+
+        if not self.gen_paddle_input():
+            print(f"[gen_paddle_input failed] device={device_str}", flush=True)
+            return None, None
+
+        # 前向
+        try:
+            if self.test_amp:
+                with paddle.amp.auto_cast():
+                    output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
+            else:
+                output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
+        except Exception as err:
+            print(f"[forward error] device={device_str}  {self.api_config.config}\n{err}", flush=True)
+            return None, None
+
+        # 反向
+        out_grads = None
+        if self.need_check_grad():
+            inputs_list = self.get_paddle_input_list()
+            try:
+                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(output)
+            except Exception as grad_prepare_err:
+                print(
+                    f"[backward prepare error] device={device_str}  {self.api_config.config}\n{grad_prepare_err}",
+                    flush=True,
+                )
+                return output, None
+
+            if inputs_list and result_outputs and result_outputs_grads:
+                try:
+                    out_grads = paddle.grad(
+                        result_outputs,
+                        inputs_list,
+                        grad_outputs=result_outputs_grads,
+                        allow_unused=True,
+                    )
+                except Exception as grad_err:
+                    print(
+                        f"[backward error] device={device_str}  {self.api_config.config}\n{grad_err}",
+                        flush=True,
+                    )
+                    out_grads = None
+            else:
+                print(
+                    f"[backward skip] device={device_str} no valid inputs or outputs for gradient computation",
+                    flush=True,
+                )
+
+        return output, out_grads
+
+    # -------------------- 主流程：GPU vs Custom 设备 --------------------
+    def test(self):
+        # 1. 是否跳过
+        if self.need_skip():
+            print("[Skip]", self.api_config.config, flush=True)
+            return
+
+        # 2. 解析 Paddle API 信息 & 生成 numpy 输入
+        if not self.ana_paddle_api_info():
+            print("[ana_paddle_api_info failed]", self.api_config.config, flush=True)
+            return
+
+        try:
+            if not self.gen_numpy_input():
+                print("[gen_numpy_input failed]", self.api_config.config, flush=True)
+                return
+        except Exception as err:
+            print("[numpy error]", self.api_config.config, "\n", str(err), flush=True)
+            return
+
+        # 3. 确定 GPU / 自定义设备字符串
+        gpu_device_str = f"gpu:{self.gpu_id}"
+
+        if self.custom_device_type is None:
+            # 自动探测：优先 XPU，再尝试自定义设备
+            try:
+                if paddle.device.is_compiled_with_xpu():
+                    self.custom_device_type = "xpu"
+                else:
+                    custom_types = paddle.device.get_all_custom_device_type()
+                    if custom_types:
+                        self.custom_device_type = custom_types[0]
+                    else:
+                        print(
+                            "[no custom device available] "
+                            "compiled_without_xpu and no custom_device_type found.",
+                            self.api_config.config,
+                            flush=True,
+                        )
+                        return
+            except Exception as e:
+                print(f"[detect custom device error] {e}", flush=True)
+                return
+
+        custom_device_str = (
+            f"{self.custom_device_type}:{self.custom_device_id}"
+            if self.custom_device_type != "xpu"
+            else f"xpu:{self.custom_device_id}"
+        )
+
+        print(
+            f"{datetime.now()} [Begin] {self.api_config.config}\n"
+            f"  GPU device   : {gpu_device_str}\n"
+            f"  Custom device: {custom_device_str}",
+            flush=True,
+        )
+
+        # 4. GPU 上运行
+        gpu_out, gpu_grads = self._run_on_device(gpu_device_str)
+        if gpu_out is None:
+            print("[gpu execution failed]", self.api_config.config, flush=True)
+        else:
+            self._dump_results("gpu", gpu_out, gpu_grads)
+            print("[gpu dump done]", self.api_config.config, flush=True)
+
+        # 5. 自定义设备 / XPU 上运行
+        custom_out, custom_grads = self._run_on_device(custom_device_str)
+        if custom_out is None:
+            print(f"[{custom_device_str} execution failed]", self.api_config.config, flush=True)
+        else:
+            tag = self.custom_device_type if self.custom_device_type is not None else "custom"
+            self._dump_results(tag, custom_out, custom_grads)
+            print(f"[{tag} dump done]", self.api_config.config, flush=True)
+
+
+def parse_bool(v):
+    if isinstance(v, bool):
+        return v
+    s = str(v).lower()
+    if s in {"true", "1", "yes", "y"}:
+        return True
+    if s in {"false", "0", "no", "n"}:
+        return False
+    raise argparse.ArgumentTypeError(f"Invalid bool value: {v}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="在 GPU / 自定义设备 上运行 API case，并将前向 + 反向结果以 npz 落盘。"
+    )
+    parser.add_argument(
+        "--api_config",
+        type=str,
+        required=True,
+        help="单条 API 配置（与 engine 中的 api_config 字符串格式一致）",
+    )
+    parser.add_argument(
+        "--dump_dir",
+        type=str,
+        default="report/gpu_custom_dump",
+        help="结果保存目录（npz 文件会按 API 配置分子目录存放）",
+    )
+    parser.add_argument(
+        "--test_amp",
+        type=parse_bool,
+        default=False,
+        help="是否在前向中启用 AMP 自动混合精度",
+    )
+    parser.add_argument(
+        "--gpu_id",
+        type=int,
+        default=0,
+        help="使用的 GPU 设备号（形如 gpu:<gpu_id>）",
+    )
+    parser.add_argument(
+        "--custom_device_type",
+        type=str,
+        default=None,
+        help="自定义设备类型名称，例如 'xpu'、'iluvatar_gpu' 等；"
+        "留空则自动探测：优先 XPU，再尝试 paddle 自定义设备。",
+    )
+    parser.add_argument(
+        "--custom_device_id",
+        type=int,
+        default=0,
+        help="自定义设备 ID，如 xpu:0 / iluvatar_gpu:0 中的 0",
+    )
+
+    args = parser.parse_args()
+
+    print(f"Options: {vars(args)}", flush=True)
+
+    try:
+        api_config = APIConfig(args.api_config.strip())
+    except Exception as err:
+        print(f"[config parse error] {args.api_config} {str(err)}", flush=True)
+        return
+
+    case = APITestGPUCustomDump(
+        api_config,
+        dump_dir=args.dump_dir,
+        test_amp=args.test_amp,
+        gpu_id=args.gpu_id,
+        custom_device_type=args.custom_device_type,
+        custom_device_id=args.custom_device_id,
+    )
+    try:
+        case.test()
+    finally:
+        case.clear_tensor()
+        del case
+        del api_config
+
+
+if __name__ == "__main__":
+    main()
+
+

From e86c71f7752ce3b60105da207b19987014d66203 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Wed, 3 Dec 2025 18:30:48 +0800
Subject: [PATCH 02/13] test0

---
 tester/gpu_custom_dump.py | 171 +++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 105 deletions(-)

diff --git a/tester/gpu_custom_dump.py b/tester/gpu_custom_dump.py
index ec575962..7625895b 100644
--- a/tester/gpu_custom_dump.py
+++ b/tester/gpu_custom_dump.py
@@ -1,6 +1,8 @@
 import argparse
 import os
+import hashlib
 from datetime import datetime
+from engineV2 import detect_device_type
 
 import paddle
 
@@ -9,33 +11,20 @@
 
 
 class APITestGPUCustomDump(APITestBase):
-    """
-    在 GPU 与自定义设备（如 XPU / 第三方定制卡）上运行同一 API case，
-    计算前向 + 反向结果，并将结果以 npz 形式落盘。
-    """
-
     def __init__(
         self,
         api_config,
-        dump_dir="report/gpu_custom_dump",
+        dump_dir="gpu_custom_dump",
         test_amp=False,
-        gpu_id=0,
-        custom_device_type=None,
-        custom_device_id=0,
     ):
         super().__init__(api_config)
         self.dump_dir = dump_dir
         self.test_amp = test_amp
-        self.gpu_id = gpu_id
-        self.custom_device_type = custom_device_type
-        self.custom_device_id = custom_device_id
 
-    # -------------------- 设备与落盘相关工具函数 --------------------
     def _ensure_dirs(self, path):
         os.makedirs(path, exist_ok=True)
 
     def _to_tensor_list(self, x):
-        """将输出 / 梯度统一转换成 Tensor 列表，便于直接序列化保存。"""
         if x is None:
             return None
         if isinstance(x, paddle.Tensor):
@@ -46,11 +35,6 @@ def _to_tensor_list(self, x):
         return None
 
     def _dump_results(self, tag, output, grads):
-        """
-        将指定设备的前向 / 反向结果直接保存为 Tensor 列表（使用 paddle.save）：
-          <dump_dir>/<sanitized_api_name>/{tag}_forward.pdtensor
-          <dump_dir>/<sanitized_api_name>/{tag}_grad.pdtensor
-        """
         api_name = self.api_config.config.replace("/", "_").replace(" ", "_")
         dump_path = os.path.join(self.dump_dir, api_name)
         self._ensure_dirs(dump_path)
@@ -58,18 +42,24 @@ def _dump_results(self, tag, output, grads):
         out_list = self._to_tensor_list(output)
         grad_list = self._to_tensor_list(grads)
 
+        key = f"{tag}-{api_name}"
+        sha16 = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
+        file_prefix = f"{tag}-{api_name}-{sha16}"
+
+        forward_path = None
+        grad_path = None
+
         if out_list is not None:
-            paddle.save(out_list, os.path.join(dump_path, f"{tag}_forward.pdtensor"))
+            forward_path = os.path.join(dump_path, f"{file_prefix}_forward.pdtensor")
+            paddle.save(out_list, forward_path)
+
         if grad_list is not None:
-            paddle.save(grad_list, os.path.join(dump_path, f"{tag}_grad.pdtensor"))
+            grad_path = os.path.join(dump_path, f"{file_prefix}_grad.pdtensor")
+            paddle.save(grad_list, grad_path)
 
-    def _run_on_device(self, device_str):
-        """
-        在指定设备上运行一次前向 + 反向，返回 (output, grads)。
-        device_str 形如：'gpu:0', 'xpu:0', 'iluvatar_gpu:0' 等。
-        """
-        import paddle
+        return forward_path, grad_path
 
+    def _run_on_device(self, device_str):
         try:
             paddle.set_device(device_str)
         except Exception as e:
@@ -80,7 +70,6 @@ def _run_on_device(self, device_str):
             print(f"[gen_paddle_input failed] device={device_str}", flush=True)
             return None, None
 
-        # 前向
         try:
             if self.test_amp:
                 with paddle.amp.auto_cast():
@@ -91,7 +80,6 @@ def _run_on_device(self, device_str):
             print(f"[forward error] device={device_str}  {self.api_config.config}\n{err}", flush=True)
             return None, None
 
-        # 反向
         out_grads = None
         if self.need_check_grad():
             inputs_list = self.get_paddle_input_list()
@@ -126,14 +114,11 @@ def _run_on_device(self, device_str):
 
         return output, out_grads
 
-    # -------------------- 主流程：GPU vs Custom 设备 --------------------
     def test(self):
-        # 1. 是否跳过
         if self.need_skip():
             print("[Skip]", self.api_config.config, flush=True)
             return
 
-        # 2. 解析 Paddle API 信息 & 生成 numpy 输入
         if not self.ana_paddle_api_info():
             print("[ana_paddle_api_info failed]", self.api_config.config, flush=True)
             return
@@ -146,59 +131,64 @@ def test(self):
             print("[numpy error]", self.api_config.config, "\n", str(err), flush=True)
             return
 
-        # 3. 确定 GPU / 自定义设备字符串
-        gpu_device_str = f"gpu:{self.gpu_id}"
+        device_type = detect_device_type()
+        try:
+            if paddle.device.is_compiled_with_cuda():
+                device_type = "gpu"
+            elif paddle.device.is_compiled_with_xpu():
+                device_type = "xpu"
+            else:
+                custom_types = paddle.device.get_all_custom_device_type()
+                if custom_types:
+                    device_type = custom_types[0]
+        except Exception as e:
+            print(f"[detect device error] {e}", flush=True)
+            return
 
-        if self.custom_device_type is None:
-            # 自动探测：优先 XPU，再尝试自定义设备
-            try:
-                if paddle.device.is_compiled_with_xpu():
-                    self.custom_device_type = "xpu"
-                else:
-                    custom_types = paddle.device.get_all_custom_device_type()
-                    if custom_types:
-                        self.custom_device_type = custom_types[0]
-                    else:
-                        print(
-                            "[no custom device available] "
-                            "compiled_without_xpu and no custom_device_type found.",
-                            self.api_config.config,
-                            flush=True,
-                        )
-                        return
-            except Exception as e:
-                print(f"[detect custom device error] {e}", flush=True)
-                return
+        if device_type is None:
+            print("[no available device]", self.api_config.config, flush=True)
+            return
 
-        custom_device_str = (
-            f"{self.custom_device_type}:{self.custom_device_id}"
-            if self.custom_device_type != "xpu"
-            else f"xpu:{self.custom_device_id}"
-        )
+        device_str = f"{device_type}:0"
 
         print(
             f"{datetime.now()} [Begin] {self.api_config.config}\n"
-            f"  GPU device   : {gpu_device_str}\n"
-            f"  Custom device: {custom_device_str}",
+            f"  Device: {device_str}",
             flush=True,
         )
 
-        # 4. GPU 上运行
-        gpu_out, gpu_grads = self._run_on_device(gpu_device_str)
-        if gpu_out is None:
-            print("[gpu execution failed]", self.api_config.config, flush=True)
+        out, grads = self._run_on_device(device_str)
+        if out is None:
+            print(f"[{device_str} execution failed]", self.api_config.config, flush=True)
         else:
-            self._dump_results("gpu", gpu_out, gpu_grads)
-            print("[gpu dump done]", self.api_config.config, flush=True)
+            forward_path, grad_path = self._dump_results(device_type, out, grads)
+            print(f"[{device_type} dump done]", self.api_config.config, flush=True)
 
-        # 5. 自定义设备 / XPU 上运行
-        custom_out, custom_grads = self._run_on_device(custom_device_str)
-        if custom_out is None:
-            print(f"[{custom_device_str} execution failed]", self.api_config.config, flush=True)
-        else:
-            tag = self.custom_device_type if self.custom_device_type is not None else "custom"
-            self._dump_results(tag, custom_out, custom_grads)
-            print(f"[{tag} dump done]", self.api_config.config, flush=True)
+            if forward_path is not None:
+                try:
+                    loaded_forward = paddle.load(forward_path)
+                    print(f"[loaded forward] {forward_path}")
+                    for i, t in enumerate(loaded_forward):
+                        arr = t.numpy().flatten()
+                        print(
+                            f"  forward[{i}] shape={t.shape}, dtype={t.dtype}, "
+                            f"first_values={arr[:10]}"
+                        )
+                except Exception as e:
+                    print(f"[load forward error] {forward_path} -> {e}", flush=True)
+
+            if grad_path is not None:
+                try:
+                    loaded_grads = paddle.load(grad_path)
+                    print(f"[loaded grad] {grad_path}")
+                    for i, t in enumerate(loaded_grads):
+                        arr = t.numpy().flatten()
+                        print(
+                            f"  grad[{i}] shape={t.shape}, dtype={t.dtype}, "
+                            f"first_values={arr[:10]}"
+                        )
+                except Exception as e:
+                    print(f"[load grad error] {grad_path} -> {e}", flush=True)
 
 
 def parse_bool(v):
@@ -213,45 +203,21 @@ def parse_bool(v):
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="在 GPU / 自定义设备 上运行 API case，并将前向 + 反向结果以 npz 落盘。"
-    )
+    parser = argparse.ArgumentParser()
     parser.add_argument(
         "--api_config",
         type=str,
         required=True,
-        help="单条 API 配置（与 engine 中的 api_config 字符串格式一致）",
     )
     parser.add_argument(
         "--dump_dir",
         type=str,
         default="report/gpu_custom_dump",
-        help="结果保存目录（npz 文件会按 API 配置分子目录存放）",
     )
     parser.add_argument(
         "--test_amp",
         type=parse_bool,
         default=False,
-        help="是否在前向中启用 AMP 自动混合精度",
-    )
-    parser.add_argument(
-        "--gpu_id",
-        type=int,
-        default=0,
-        help="使用的 GPU 设备号（形如 gpu:<gpu_id>）",
-    )
-    parser.add_argument(
-        "--custom_device_type",
-        type=str,
-        default=None,
-        help="自定义设备类型名称，例如 'xpu'、'iluvatar_gpu' 等；"
-        "留空则自动探测：优先 XPU，再尝试 paddle 自定义设备。",
-    )
-    parser.add_argument(
-        "--custom_device_id",
-        type=int,
-        default=0,
-        help="自定义设备 ID，如 xpu:0 / iluvatar_gpu:0 中的 0",
     )
 
     args = parser.parse_args()
@@ -268,9 +234,6 @@ def main():
         api_config,
         dump_dir=args.dump_dir,
         test_amp=args.test_amp,
-        gpu_id=args.gpu_id,
-        custom_device_type=args.custom_device_type,
-        custom_device_id=args.custom_device_id,
     )
     try:
         case.test()
@@ -282,5 +245,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
-

From 137d40fe9e8c4d8e87e6e3ef770290846890e19d Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Fri, 5 Dec 2025 17:19:00 +0800
Subject: [PATCH 03/13] add customvsgpu

---
 engineV2.py                    |  46 +++-
 tester/__init__.py             |   5 +
 tester/gpu_custom_dump.py      | 247 --------------------
 tester/paddle_device_vs_gpu.py | 411 +++++++++++++++++++++++++++++++++
 4 files changed, 459 insertions(+), 250 deletions(-)
 delete mode 100644 tester/gpu_custom_dump.py
 create mode 100644 tester/paddle_device_vs_gpu.py

diff --git a/engineV2.py b/engineV2.py
index 01c92a74..f0785939 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -29,6 +29,7 @@
         APITestPaddleTorchGPUPerformance,
         APITestAccuracyStable,
         APITestCustomDeviceVSCPU,
+        APITestPaddleDeviceVSGPU,
     )
     import torch
     import paddle
@@ -38,7 +39,10 @@
 os.environ["FLAGS_use_system_allocator"] = "1"
 os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
 
-VALID_TEST_ARGS = {"test_amp", "test_backward", "atol", "rtol", "test_tol"}
+VALID_TEST_ARGS = {
+    "test_amp", "test_backward", "atol", "rtol", "test_tol",
+    "operation_mode", "bos_path", "target_device_type", "random_seed"
+}
 
 DEVICE_TYPE = None
 DEVICE_TYPE_DETECTED = False
@@ -384,7 +388,8 @@ def pid_exists(pid):
                             APITestPaddleOnly,
                             APITestPaddleTorchGPUPerformance,
                             APITestTorchGPUPerformance,
-                            APITestCustomDeviceVSCPU)
+                            APITestCustomDeviceVSCPU,
+                            APITestPaddleDeviceVSGPU)
 
         test_classes = {
             "APIConfig": APIConfig,
@@ -395,7 +400,8 @@ def pid_exists(pid):
             "APITestTorchGPUPerformance": APITestTorchGPUPerformance,
             "APITestPaddleTorchGPUPerformance": APITestPaddleTorchGPUPerformance,
             "APITestAccuracyStable": APITestAccuracyStable,
-            "APITestCustomDeviceVSCPU": APITestCustomDeviceVSCPU
+            "APITestCustomDeviceVSCPU": APITestCustomDeviceVSCPU,
+            "APITestPaddleDeviceVSGPU": APITestPaddleDeviceVSGPU
         }
         globals().update(test_classes)
 
@@ -466,6 +472,7 @@ def run_test_case(api_config_str, options):
         "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
         "accuracy_stable": APITestAccuracyStable,
         "paddle_custom_device": APITestCustomDeviceVSCPU,
+        "custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
     }
     test_class = next(
         (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
@@ -646,6 +653,30 @@ def main():
         default=0,
         help="The numpy random seed ",
     )
+    parser.add_argument(
+        "--custom_device_vs_gpu",
+        type=parse_bool,
+        default=False,
+        help="test paddle api on custom device vs GPU",
+    )
+    parser.add_argument(
+        "--operation_mode",
+        type=str,
+        choices=["upload", "download"],
+        help="Operation mode: upload or download",
+    )
+    parser.add_argument(
+        "--bos_path",
+        type=str,
+        default="",
+        help="BOS storage path (required when operation_mode is specified)",
+    )
+    parser.add_argument(
+        "--target_device_type",
+        type=str,
+        choices=["gpu", "paddle_device"],
+        help="Target device type for download mode",
+    )
 
     options = parser.parse_args()
     print(f"Options: {vars(options)}", flush=True)
@@ -661,6 +692,7 @@ def main():
         options.paddle_torch_gpu_performance,
         options.accuracy_stable,
         options.paddle_custom_device,
+        options.custom_device_vs_gpu,
     ]
     if len([m for m in mode if m is True]) != 1:
         print(
@@ -673,10 +705,18 @@ def main():
             "--paddle_torch_gpu_performance"
             "--accuracy_stable"
             "--paddle_custom_device"
+            "--custom_device_vs_gpu"
             " to True.",
             flush=True,
         )
         return
+    if options.custom_device_vs_gpu:
+        if options.operation_mode and not options.bos_path:
+            print("--bos_path is required when --operation_mode is specified", flush=True)
+            return
+        if options.operation_mode == "download" and not options.target_device_type:
+            print("--target_device_type is required in download mode", flush=True)
+            return
     if options.test_tol and not options.accuracy:
         print(f"--test_tol takes effect when --accuracy is True.", flush=True)
     if options.test_backward and not options.paddle_cinn:
diff --git a/tester/__init__.py b/tester/__init__.py
index b758104a..01d73b45 100644
--- a/tester/__init__.py
+++ b/tester/__init__.py
@@ -12,6 +12,7 @@
     'APITestPaddleTorchGPUPerformance',
     'APITestAccuracyStable',
     'APITestCustomDeviceVSCPU',
+    'APITestPaddleDeviceVSGPU',
     'paddle_to_torch',
     'TensorConfig', 
     'APIConfig', 
@@ -32,6 +33,7 @@
     from .paddle_cinn_vs_dygraph import APITestCINNVSDygraph
     from .accuracy_stable import APITestAccuracyStable
     from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
+    from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
     from . import paddle_to_torch
     from .api_config import (
         TensorConfig,
@@ -74,6 +76,9 @@ def __getattr__(name: str) -> Any:
     elif name == 'APITestCustomDeviceVSCPU':
         from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
         return APITestCustomDeviceVSCPU
+    elif name == 'APITestPaddleDeviceVSGPU':
+        from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
+        return APITestPaddleDeviceVSGPU
     elif name == 'paddle_to_torch':
         from . import paddle_to_torch
         return paddle_to_torch
diff --git a/tester/gpu_custom_dump.py b/tester/gpu_custom_dump.py
deleted file mode 100644
index 7625895b..00000000
--- a/tester/gpu_custom_dump.py
+++ /dev/null
@@ -1,247 +0,0 @@
-import argparse
-import os
-import hashlib
-from datetime import datetime
-from engineV2 import detect_device_type
-
-import paddle
-
-from . import APIConfig
-from .base import APITestBase
-
-
-class APITestGPUCustomDump(APITestBase):
-    def __init__(
-        self,
-        api_config,
-        dump_dir="gpu_custom_dump",
-        test_amp=False,
-    ):
-        super().__init__(api_config)
-        self.dump_dir = dump_dir
-        self.test_amp = test_amp
-
-    def _ensure_dirs(self, path):
-        os.makedirs(path, exist_ok=True)
-
-    def _to_tensor_list(self, x):
-        if x is None:
-            return None
-        if isinstance(x, paddle.Tensor):
-            return [x]
-        if isinstance(x, (list, tuple)):
-            tensors = [t for t in x if isinstance(t, paddle.Tensor)]
-            return tensors or None
-        return None
-
-    def _dump_results(self, tag, output, grads):
-        api_name = self.api_config.config.replace("/", "_").replace(" ", "_")
-        dump_path = os.path.join(self.dump_dir, api_name)
-        self._ensure_dirs(dump_path)
-
-        out_list = self._to_tensor_list(output)
-        grad_list = self._to_tensor_list(grads)
-
-        key = f"{tag}-{api_name}"
-        sha16 = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
-        file_prefix = f"{tag}-{api_name}-{sha16}"
-
-        forward_path = None
-        grad_path = None
-
-        if out_list is not None:
-            forward_path = os.path.join(dump_path, f"{file_prefix}_forward.pdtensor")
-            paddle.save(out_list, forward_path)
-
-        if grad_list is not None:
-            grad_path = os.path.join(dump_path, f"{file_prefix}_grad.pdtensor")
-            paddle.save(grad_list, grad_path)
-
-        return forward_path, grad_path
-
-    def _run_on_device(self, device_str):
-        try:
-            paddle.set_device(device_str)
-        except Exception as e:
-            print(f"[device set error] {device_str} -> {e}", flush=True)
-            return None, None
-
-        if not self.gen_paddle_input():
-            print(f"[gen_paddle_input failed] device={device_str}", flush=True)
-            return None, None
-
-        try:
-            if self.test_amp:
-                with paddle.amp.auto_cast():
-                    output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
-            else:
-                output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
-        except Exception as err:
-            print(f"[forward error] device={device_str}  {self.api_config.config}\n{err}", flush=True)
-            return None, None
-
-        out_grads = None
-        if self.need_check_grad():
-            inputs_list = self.get_paddle_input_list()
-            try:
-                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(output)
-            except Exception as grad_prepare_err:
-                print(
-                    f"[backward prepare error] device={device_str}  {self.api_config.config}\n{grad_prepare_err}",
-                    flush=True,
-                )
-                return output, None
-
-            if inputs_list and result_outputs and result_outputs_grads:
-                try:
-                    out_grads = paddle.grad(
-                        result_outputs,
-                        inputs_list,
-                        grad_outputs=result_outputs_grads,
-                        allow_unused=True,
-                    )
-                except Exception as grad_err:
-                    print(
-                        f"[backward error] device={device_str}  {self.api_config.config}\n{grad_err}",
-                        flush=True,
-                    )
-                    out_grads = None
-            else:
-                print(
-                    f"[backward skip] device={device_str} no valid inputs or outputs for gradient computation",
-                    flush=True,
-                )
-
-        return output, out_grads
-
-    def test(self):
-        if self.need_skip():
-            print("[Skip]", self.api_config.config, flush=True)
-            return
-
-        if not self.ana_paddle_api_info():
-            print("[ana_paddle_api_info failed]", self.api_config.config, flush=True)
-            return
-
-        try:
-            if not self.gen_numpy_input():
-                print("[gen_numpy_input failed]", self.api_config.config, flush=True)
-                return
-        except Exception as err:
-            print("[numpy error]", self.api_config.config, "\n", str(err), flush=True)
-            return
-
-        device_type = detect_device_type()
-        try:
-            if paddle.device.is_compiled_with_cuda():
-                device_type = "gpu"
-            elif paddle.device.is_compiled_with_xpu():
-                device_type = "xpu"
-            else:
-                custom_types = paddle.device.get_all_custom_device_type()
-                if custom_types:
-                    device_type = custom_types[0]
-        except Exception as e:
-            print(f"[detect device error] {e}", flush=True)
-            return
-
-        if device_type is None:
-            print("[no available device]", self.api_config.config, flush=True)
-            return
-
-        device_str = f"{device_type}:0"
-
-        print(
-            f"{datetime.now()} [Begin] {self.api_config.config}\n"
-            f"  Device: {device_str}",
-            flush=True,
-        )
-
-        out, grads = self._run_on_device(device_str)
-        if out is None:
-            print(f"[{device_str} execution failed]", self.api_config.config, flush=True)
-        else:
-            forward_path, grad_path = self._dump_results(device_type, out, grads)
-            print(f"[{device_type} dump done]", self.api_config.config, flush=True)
-
-            if forward_path is not None:
-                try:
-                    loaded_forward = paddle.load(forward_path)
-                    print(f"[loaded forward] {forward_path}")
-                    for i, t in enumerate(loaded_forward):
-                        arr = t.numpy().flatten()
-                        print(
-                            f"  forward[{i}] shape={t.shape}, dtype={t.dtype}, "
-                            f"first_values={arr[:10]}"
-                        )
-                except Exception as e:
-                    print(f"[load forward error] {forward_path} -> {e}", flush=True)
-
-            if grad_path is not None:
-                try:
-                    loaded_grads = paddle.load(grad_path)
-                    print(f"[loaded grad] {grad_path}")
-                    for i, t in enumerate(loaded_grads):
-                        arr = t.numpy().flatten()
-                        print(
-                            f"  grad[{i}] shape={t.shape}, dtype={t.dtype}, "
-                            f"first_values={arr[:10]}"
-                        )
-                except Exception as e:
-                    print(f"[load grad error] {grad_path} -> {e}", flush=True)
-
-
-def parse_bool(v):
-    if isinstance(v, bool):
-        return v
-    s = str(v).lower()
-    if s in {"true", "1", "yes", "y"}:
-        return True
-    if s in {"false", "0", "no", "n"}:
-        return False
-    raise argparse.ArgumentTypeError(f"Invalid bool value: {v}")
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--api_config",
-        type=str,
-        required=True,
-    )
-    parser.add_argument(
-        "--dump_dir",
-        type=str,
-        default="report/gpu_custom_dump",
-    )
-    parser.add_argument(
-        "--test_amp",
-        type=parse_bool,
-        default=False,
-    )
-
-    args = parser.parse_args()
-
-    print(f"Options: {vars(args)}", flush=True)
-
-    try:
-        api_config = APIConfig(args.api_config.strip())
-    except Exception as err:
-        print(f"[config parse error] {args.api_config} {str(err)}", flush=True)
-        return
-
-    case = APITestGPUCustomDump(
-        api_config,
-        dump_dir=args.dump_dir,
-        test_amp=args.test_amp,
-    )
-    try:
-        case.test()
-    finally:
-        case.clear_tensor()
-        del case
-        del api_config
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py
new file mode 100644
index 00000000..fc145132
--- /dev/null
+++ b/tester/paddle_device_vs_gpu.py
@@ -0,0 +1,411 @@
+import hashlib
+import json
+import os
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+import numpy as np
+import paddle
+
+from .api_config.log_writer import write_to_log
+from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
+
+
+class APITestPaddleDeviceVSGPU(APITestCustomDeviceVSCPU):
+    def __init__(self, api_config, **kwargs):
+        # 继承 CustomDevice vs CPU 的基本功能
+        super().__init__(api_config, **kwargs)
+        
+        # 新增参数
+        self.operation_mode = kwargs.get("operation_mode", None)
+        self.bos_path = kwargs.get("bos_path", "")
+        self.target_device_type = kwargs.get("target_device_type", "")
+        self.random_seed = kwargs.get("random_seed", 0)
+        self.atol = kwargs.get("atol", 1e-2)
+        self.rtol = kwargs.get("rtol", 1e-2)
+        
+        # 设置随机种子确保一致性
+        if self.random_seed != 0:
+            np.random.seed(self.random_seed)
+            paddle.seed(self.random_seed)
+            
+    def _get_config_hash(self):
+        """生成API配置的哈希值，用于文件名"""
+        config_str = json.dumps({
+            "api_name": self.api_config.api_name,
+            "args": [str(arg) for arg in self.api_config.args],
+            "kwargs": {k: str(v) for k, v in self.api_config.kwargs.items()}
+        }, sort_keys=True)
+        return hashlib.md5(config_str.encode()).hexdigest()[:16]
+
+    def _get_local_device_type(self):
+        """Return "gpu", "xpu", the custom device type, or "cpu" (also when probing fails)."""
+        try:
+            # Probe CUDA through paddle: this module does not import torch.
+            if paddle.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0:
+                return "gpu"
+            if self.check_xpu_available():
+                return "xpu"
+            if self.check_custom_device_available():
+                return self.custom_device_type
+            return "cpu"
+        except Exception:
+            return "cpu"
+
+    def _get_filename(self, device_type=None):
+        """Build the file name "<device_type>-<random_seed>-<config hash>.pdtensor"; device_type defaults to the local device type."""
+        if device_type is None:
+            device_type = self._get_local_device_type()
+        return f"{device_type}-{self.random_seed}-{self._get_config_hash()}.pdtensor"
+
+    def _save_tensor_locally(self, output, grads=None):
+        """保存结果到本地PDTensor文件"""
+        # 保存到临时文件
+        temp_dir = tempfile.gettempdir()
+        filename = self._get_filename().replace('.npz', '.pdtensor')
+        local_path = Path(temp_dir) / filename
+        
+        # 使用paddle.save保存张量数据
+        save_data = {'output': output}
+        if grads is not None:
+            save_data['grads'] = grads
+            
+        paddle.save(save_data, str(local_path))
+        print(f"[upload] Saved pdtensor file: {local_path}", flush=True)
+        return local_path
+
+    def _upload_to_bos(self, local_path):
+        """上传文件到指定路径，支持本地和BOS"""
+        if not self.bos_path:
+            print(f"[upload] No bos_path specified, skip upload", flush=True)
+            return
+        
+        try:
+            # 判断路径类型：本地路径还是BOS路径
+            if self.bos_path.startswith("bos://"):
+                # BOS路径：使用bcecmd工具上传
+                remote_path = f"{self.bos_path.rstrip('/')}/{local_path.name}"
+                print(f"[upload] Starting upload to BOS: {remote_path}", flush=True)
+                
+                cmd = ["bcecmd", "bos", "cp", str(local_path), remote_path]
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+                
+                if result.returncode == 0:
+                    print(f"[upload] Upload succeeded: {remote_path}", flush=True)
+                    local_path.unlink(missing_ok=True)
+                else:
+                    print(f"[upload] Upload failed: {remote_path}, error: {result.stderr}", flush=True)
+            else:
+                # 本地路径：直接复制文件
+                local_bos_path = Path(self.bos_path).resolve()
+                remote_path = local_bos_path / local_path.name
+                
+                # 确保目录存在
+                local_bos_path.mkdir(parents=True, exist_ok=True)
+                print(f"[upload] Copying file to local path: {remote_path}", flush=True)
+                
+                # 复制文件
+                import shutil
+                shutil.copy2(local_path, remote_path)
+                print(f"[upload] File copied successfully: {remote_path}", flush=True)
+                
+                # 删除临时文件
+                local_path.unlink(missing_ok=True)
+                
+        except Exception as e:
+            print(f"[upload] Upload failed: {e}", flush=True)
+
+    def _download_from_bos(self, filename):
+        """从指定路径下载文件，支持本地和BOS"""
+        if not self.bos_path:
+            print(f"[download] No bos_path specified, skip download", flush=True)
+            return None
+        
+        temp_dir = tempfile.gettempdir()
+        local_path = Path(temp_dir) / filename
+        
+        if local_path.exists():
+            print(f"[download] File already exists locally: {local_path}", flush=True)
+            return local_path
+
+        try:
+            # 判断路径类型：本地路径还是BOS路径
+            if self.bos_path.startswith("bos://"):
+                # BOS路径：使用bcecmd工具下载
+                remote_path = f"{self.bos_path.rstrip('/')}/{filename}"
+                print(f"[download] Starting download from BOS: {remote_path}", flush=True)
+                
+                cmd = ["bcecmd", "bos", "cp", remote_path, str(local_path)]
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+                
+                if result.returncode == 0:
+                    print(f"[download] Download succeeded: {local_path}", flush=True)
+                    return local_path
+                else:
+                    print(f"[download] Download failed: {remote_path}, error: {result.stderr}", flush=True)
+                    return None
+            else:
+                # 本地路径：直接复制文件
+                local_bos_path = Path(self.bos_path).resolve()
+                remote_path = local_bos_path / filename
+                
+                print(f"[download] Copying file from local path: {remote_path}", flush=True)
+                
+                if not remote_path.exists():
+                    print(f"[download] File not found: {remote_path}", flush=True)
+                    return None
+                
+                # 复制文件
+                import shutil
+                shutil.copy2(remote_path, local_path)
+                print(f"[download] File copied successfully: {local_path}", flush=True)
+                return local_path
+                
+        except Exception as e:
+            print(f"[download] Download failed: {e}", flush=True)
+            return None
+
+    def _run_paddle_on_gpu(self):
+        """在GPU上运行Paddle实现"""
+        try:
+            # 设置GPU设备
+            paddle.set_device("gpu:0")
+            
+            # 解析Paddle API信息
+            if not self.ana_paddle_api_info():
+                print("ana_paddle_api_info failed", flush=True)
+                return None, None
+
+            # 生成输入数据
+            if not self.gen_numpy_input():
+                print("gen_numpy_input failed", flush=True)
+                return None, None
+
+            if not self.gen_paddle_input():
+                print("gen_paddle_input failed", flush=True)
+                return None, None
+
+            # 执行Forward
+            paddle_output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
+            
+            # 执行Backward（如果需要）
+            paddle_grads = None
+            if self.need_check_grad():
+                inputs_list = self.get_paddle_input_list()
+                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(paddle_output)
+                if inputs_list and result_outputs and result_outputs_grads:
+                    paddle_grads = paddle.grad(
+                        outputs=result_outputs, 
+                        inputs=inputs_list, 
+                        grad_outputs=result_outputs_grads,
+                        allow_unused=True
+                    )
+            
+            return paddle_output, paddle_grads
+            
+        except Exception as e:
+            print(f"[paddle gpu error] {self.api_config.config}: {e}", flush=True)
+            write_to_log("paddle_error", self.api_config.config)
+            return None, None
+
+    def _run_paddle_on_custom_device(self):
+        """在Paddle自定义设备上运行"""
+        try:
+            paddle_device_type = "cpu"  # 默认为CPU
+            
+            # 设置自定义设备
+            if self.check_xpu_available():
+                paddle.set_device(f"xpu:{self.xpu_device_id}")
+                paddle_device_type = "xpu"
+            elif self.check_custom_device_available():
+                paddle.set_device(f"{self.custom_device_type}:{self.custom_device_id}")
+                paddle_device_type = self.custom_device_type
+            else:
+                print(f"[error] No custom device available", flush=True)
+                return None, None
+
+            # 解析Paddle API信息
+            if not self.ana_paddle_api_info():
+                print("ana_paddle_api_info failed", flush=True)
+                return None, None
+
+            # 生成输入数据
+            if not self.gen_numpy_input():
+                print("gen_numpy_input failed", flush=True)
+                return None, None
+
+            if not self.gen_paddle_input():
+                print("gen_paddle_input failed", flush=True)
+                return None, None
+
+            # 执行Forward
+            paddle_output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
+
+            # 执行Backward（如果需要）
+            paddle_grads = None
+            if self.need_check_grad():
+                inputs_list = self.get_paddle_input_list()
+                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(paddle_output)
+                if inputs_list and result_outputs and result_outputs_grads:
+                    paddle_grads = paddle.grad(
+                        outputs=result_outputs, 
+                        inputs=inputs_list, 
+                        grad_outputs=result_outputs_grads,
+                        allow_unused=True
+                    )
+            
+            return paddle_output, paddle_grads
+            
+        except Exception as e:
+            print(f"[paddle {paddle_device_type} error] {self.api_config.config}: {e}", flush=True)
+            write_to_log("paddle_error", self.api_config.config)
+            return None, None
+
+    def _compare_with_downloaded(self, local_output, local_grads, downloaded_tensor):
+        """与下载的结果进行对比"""
+        try:
+            print(f"[compare] Comparing results for {self.api_config.config}", flush=True)
+            
+            # 加载下载的数据
+            remote_data = paddle.load(str(downloaded_tensor))
+            remote_output = remote_data['output']
+            
+            # 对比Forward输出（直接使用Paddle对比）
+            try:
+                if isinstance(local_output, paddle.Tensor) and isinstance(remote_output, paddle.Tensor):
+                    # 使用Paddle的对比方法
+                    np.testing.assert_allclose(
+                        local_output.numpy(), remote_output.numpy(), 
+                        atol=self.atol, rtol=self.rtol, equal_nan=True
+                    )
+                elif isinstance(local_output, (list, tuple)) and isinstance(remote_output, (list, tuple)):
+                    # 列表或元组对比
+                    for i, (local_item, remote_item) in enumerate(zip(local_output, remote_output)):
+                        if isinstance(local_item, paddle.Tensor) and isinstance(remote_item, paddle.Tensor):
+                            np.testing.assert_allclose(
+                                local_item.numpy(), remote_item.numpy(), 
+                                atol=self.atol, rtol=self.rtol, equal_nan=True
+                            )
+                            print(f"[compare] Forward output[{i}] comparison passed", flush=True)
+                else:
+                    # 其他情况，尝试转换为numpy对比
+                    local_np = local_output.numpy() if isinstance(local_output, paddle.Tensor) else np.array(local_output)
+                    remote_np = remote_output.numpy() if isinstance(remote_output, paddle.Tensor) else np.array(remote_output)
+                    np.testing.assert_allclose(local_np, remote_np, atol=self.atol, rtol=self.rtol, equal_nan=True)
+                
+                print(f"[compare] Forward accuracy check passed for {self.api_config.config}", flush=True)
+            except Exception as e:
+                print(f"[compare] Forward accuracy check failed for {self.api_config.config}, error: {e}", flush=True)
+                write_to_log("accuracy_error", self.api_config.config)
+                return False
+            
+            # 对比Backward梯度（如果存在且Forward通过）
+            if local_grads is not None and 'grads' in remote_data:
+                remote_grads = remote_data['grads']
+                
+                try:
+                    if isinstance(local_grads, (list, tuple)) and isinstance(remote_grads, (list, tuple)):
+                        for i, (local_grad, remote_grad) in enumerate(zip(local_grads, remote_grads)):
+                            if isinstance(local_grad, paddle.Tensor) and isinstance(remote_grad, paddle.Tensor):
+                                np.testing.assert_allclose(
+                                    local_grad.numpy(), remote_grad.numpy(), 
+                                    atol=self.atol, rtol=self.rtol, equal_nan=True
+                                )
+                                print(f"[compare] Backward gradient[{i}] comparison passed", flush=True)
+                    elif isinstance(local_grads, paddle.Tensor) and isinstance(remote_grads, paddle.Tensor):
+                        np.testing.assert_allclose(
+                            local_grads.numpy(), remote_grads.numpy(), 
+                            atol=self.atol, rtol=self.rtol, equal_nan=True
+                        )
+                    
+                    print(f"[compare] Backward gradient check passed for {self.api_config.config}", flush=True)
+                except Exception as e:
+                    print(f"[compare] Backward gradient check failed for {self.api_config.config}, error: {e}", flush=True)
+                    return False
+            
+            print(f"[compare] Accuracy check passed for {self.api_config.config}", flush=True)
+            write_to_log("pass", self.api_config.config)
+            return True
+            
+        except Exception as e:
+            print(f"[compare] Comparison failed for {self.api_config.config}, error: {e}", flush=True)
+            write_to_log("accuracy_error", self.api_config.config)
+            return False
+
+    def test(self):
+        """Main test function"""
+        if self.operation_mode == "upload":
+            self._test_upload_mode()
+        elif self.operation_mode == "download":
+            self._test_download_mode()
+        else:
+            # 默认模式：本地直接对比
+            print("[info] No operation mode specified, running in local mode")
+            self._test_local_mode()
+
+    def _test_upload_mode(self):
+        """Upload模式：执行测试并上传结果"""
+        print(f"[upload] Starting upload mode for {self.api_config.config}", flush=True)
+        
+        local_device_type = self._get_local_device_type()
+        
+        if local_device_type == "gpu":
+            # GPU端：使用Paddle在GPU上执行
+            output, grads = self._run_paddle_on_gpu()
+        else:
+            # PaddleDevice端：使用Paddle在自定义设备上执行
+            output, grads = self._run_paddle_on_custom_device()
+        
+        if output is None:
+            print(f"[upload] Execution failed for {self.api_config.config}", flush=True)
+            return
+            
+        # 保存结果到本地PDTensor
+        local_path = self._save_tensor_locally(output, grads)
+        
+        # 异步上传到BOS
+        self._upload_to_bos(local_path)
+        
+        print(f"[upload] Upload mode completed for {self.api_config.config}", flush=True)
+
+    def _test_download_mode(self):
+        """Download模式：下载对比数据并验证"""
+        print(f"[download] Starting download mode for {self.api_config.config}", flush=True)
+        
+        # 确定要下载的文件名
+        target_filename = self._get_filename(self.target_device_type)
+        
+        # 下载文件
+        downloaded_file = self._download_from_bos(target_filename)
+        if downloaded_file is None:
+            print(f"[download] Failed to download comparison data for {self.api_config.config}", flush=True)
+            return
+        
+        # 在本地设备上执行测试
+        local_device_type = self._get_local_device_type()
+        
+        if local_device_type == "gpu":
+            # GPU端：使用Paddle在GPU上执行
+            local_output, local_grads = self._run_paddle_on_gpu()
+        else:
+            # PaddleDevice端：使用Paddle在自定义设备上执行
+            local_output, local_grads = self._run_paddle_on_custom_device()
+        
+        if local_output is None:
+            print(f"[download] Local execution failed for {self.api_config.config}", flush=True)
+            return
+            
+        # 与下载的结果进行对比
+        success = self._compare_with_downloaded(local_output, local_grads, downloaded_file)
+        
+        # 清理下载的文件
+        downloaded_file.unlink(missing_ok=True)
+        
+        print(f"[download] Download mode completed for {self.api_config.config}", flush=True)
+
+    def _test_local_mode(self):
+        """默认模式：本地直接对比（暂不支持）"""
+        print(f"[local] Local mode not implemented yet for {self.api_config.config}", flush=True)
+        print("[info] Please specify --operation_mode=upload or --operation_mode=download", flush=True)

From e93a4994d058134e6d8b6844c802202140632a70 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Tue, 9 Dec 2025 15:40:06 +0800
Subject: [PATCH 04/13] use bos

---
 engineV2.py                    |  25 ++-
 tester/paddle_device_vs_gpu.py | 400 ++++++++++++++++++++-------------
 2 files changed, 263 insertions(+), 162 deletions(-)

diff --git a/engineV2.py b/engineV2.py
index f0785939..89a1d08e 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -40,8 +40,17 @@
 os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
 
 VALID_TEST_ARGS = {
-    "test_amp", "test_backward", "atol", "rtol", "test_tol",
-    "operation_mode", "bos_path", "target_device_type", "random_seed"
+    "test_amp",
+    "test_backward",
+    "atol",
+    "rtol",
+    "test_tol",
+    "operation_mode",
+    "bos_path",
+    "target_device_type",
+    "random_seed",
+    "bos_conf_path",
+    "bcecmd_path",
 }
 
 DEVICE_TYPE = None
@@ -671,6 +680,18 @@ def main():
         default="",
         help="BOS storage path (required when operation_mode is specified)",
     )
+    parser.add_argument(
+        "--bos_conf_path",
+        type=str,
+        default="./conf",
+        help="Path for bcecmd --conf-path when using BOS",
+    )
+    parser.add_argument(
+        "--bcecmd_path",
+        type=str,
+        default="./bcecmd",
+        help="bcecmd binary path used for BOS upload/download",
+    )
     parser.add_argument(
         "--target_device_type",
         type=str,
diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py
index fc145132..a74235f0 100644
--- a/tester/paddle_device_vs_gpu.py
+++ b/tester/paddle_device_vs_gpu.py
@@ -1,9 +1,7 @@
 import hashlib
 import json
-import os
 import subprocess
 import tempfile
-import time
 from pathlib import Path
 
 import numpy as np
@@ -17,7 +15,7 @@ class APITestPaddleDeviceVSGPU(APITestCustomDeviceVSCPU):
     def __init__(self, api_config, **kwargs):
         # 继承 CustomDevice vs CPU 的基本功能
         super().__init__(api_config, **kwargs)
-        
+
         # 新增参数
         self.operation_mode = kwargs.get("operation_mode", None)
         self.bos_path = kwargs.get("bos_path", "")
@@ -25,19 +23,24 @@ def __init__(self, api_config, **kwargs):
         self.random_seed = kwargs.get("random_seed", 0)
         self.atol = kwargs.get("atol", 1e-2)
         self.rtol = kwargs.get("rtol", 1e-2)
-        
+        self.bcecmd_path = Path(kwargs.get("bcecmd_path", "./bcecmd")).resolve()
+        self.bos_conf_path = kwargs.get("bos_conf_path", "./conf")
+
         # 设置随机种子确保一致性
         if self.random_seed != 0:
             np.random.seed(self.random_seed)
             paddle.seed(self.random_seed)
-            
+
     def _get_config_hash(self):
         """生成API配置的哈希值，用于文件名"""
-        config_str = json.dumps({
-            "api_name": self.api_config.api_name,
-            "args": [str(arg) for arg in self.api_config.args],
-            "kwargs": {k: str(v) for k, v in self.api_config.kwargs.items()}
-        }, sort_keys=True)
+        config_str = json.dumps(
+            {
+                "api_name": self.api_config.api_name,
+                "args": [str(arg) for arg in self.api_config.args],
+                "kwargs": {k: str(v) for k, v in self.api_config.kwargs.items()},
+            },
+            sort_keys=True,
+        )
         return hashlib.md5(config_str.encode()).hexdigest()[:16]
 
     def _get_local_device_type(self):
@@ -64,105 +67,81 @@ def _save_tensor_locally(self, output, grads=None):
         """保存结果到本地PDTensor文件"""
         # 保存到临时文件
         temp_dir = tempfile.gettempdir()
-        filename = self._get_filename().replace('.npz', '.pdtensor')
+        filename = self._get_filename().replace(".npz", ".pdtensor")
         local_path = Path(temp_dir) / filename
-        
+
         # 使用paddle.save保存张量数据
-        save_data = {'output': output}
+        save_data = {"output": output}
         if grads is not None:
-            save_data['grads'] = grads
-            
+            save_data["grads"] = grads
+
         paddle.save(save_data, str(local_path))
         print(f"[upload] Saved pdtensor file: {local_path}", flush=True)
         return local_path
 
+    def _build_bos_path(self, filename: str) -> str:
+        cleaned = self.bos_path.strip()  # accepts "bucket/dir", "bos:/bucket/dir" and "bos://bucket/dir"
+        return f"bos:/{(cleaned[4:] if cleaned.startswith('bos:') else cleaned).strip('/')}/{filename}"
+
+    def _bcecmd_cp(self, src: str, dst: str, action: str):
+        """Run `<bcecmd_path> --conf-path <bos_conf_path> bos cp src dst` (300 s timeout) and return the CompletedProcess; `action` only tags the log line."""
+        cmd = [
+            str(self.bcecmd_path),
+            "--conf-path",
+            self.bos_conf_path,
+            "bos",
+            "cp",
+            src,
+            dst,
+        ]
+        print(f"[{action}] Running command: {' '.join(cmd)}", flush=True)
+        return subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+
     def _upload_to_bos(self, local_path):
-        """上传文件到指定路径，支持本地和BOS"""
+        """使用 bcecmd 上传文件到 BOS"""
         if not self.bos_path:
             print(f"[upload] No bos_path specified, skip upload", flush=True)
             return
-        
+
+        remote_path = self._build_bos_path(local_path.name)
         try:
-            # 判断路径类型：本地路径还是BOS路径
-            if self.bos_path.startswith("bos://"):
-                # BOS路径：使用bcecmd工具上传
-                remote_path = f"{self.bos_path.rstrip('/')}/{local_path.name}"
-                print(f"[upload] Starting upload to BOS: {remote_path}", flush=True)
-                
-                cmd = ["bcecmd", "bos", "cp", str(local_path), remote_path]
-                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
-                
-                if result.returncode == 0:
-                    print(f"[upload] Upload succeeded: {remote_path}", flush=True)
-                    local_path.unlink(missing_ok=True)
-                else:
-                    print(f"[upload] Upload failed: {remote_path}, error: {result.stderr}", flush=True)
-            else:
-                # 本地路径：直接复制文件
-                local_bos_path = Path(self.bos_path).resolve()
-                remote_path = local_bos_path / local_path.name
-                
-                # 确保目录存在
-                local_bos_path.mkdir(parents=True, exist_ok=True)
-                print(f"[upload] Copying file to local path: {remote_path}", flush=True)
-                
-                # 复制文件
-                import shutil
-                shutil.copy2(local_path, remote_path)
-                print(f"[upload] File copied successfully: {remote_path}", flush=True)
-                
-                # 删除临时文件
+            result = self._bcecmd_cp(str(local_path), remote_path, "upload")
+            if result.returncode == 0:
+                print(f"[upload] Upload succeeded: {remote_path}", flush=True)
                 local_path.unlink(missing_ok=True)
-                
+            else:
+                print(
+                    f"[upload] Upload failed: {remote_path}, stderr: {result.stderr}",
+                    flush=True,
+                )
         except Exception as e:
             print(f"[upload] Upload failed: {e}", flush=True)
 
     def _download_from_bos(self, filename):
-        """从指定路径下载文件，支持本地和BOS"""
+        """使用 bcecmd 从 BOS 下载文件"""
         if not self.bos_path:
             print(f"[download] No bos_path specified, skip download", flush=True)
             return None
-        
+
         temp_dir = tempfile.gettempdir()
         local_path = Path(temp_dir) / filename
-        
+
         if local_path.exists():
             print(f"[download] File already exists locally: {local_path}", flush=True)
             return local_path
 
+        remote_path = self._build_bos_path(filename)
         try:
-            # 判断路径类型：本地路径还是BOS路径
-            if self.bos_path.startswith("bos://"):
-                # BOS路径：使用bcecmd工具下载
-                remote_path = f"{self.bos_path.rstrip('/')}/{filename}"
-                print(f"[download] Starting download from BOS: {remote_path}", flush=True)
-                
-                cmd = ["bcecmd", "bos", "cp", remote_path, str(local_path)]
-                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
-                
-                if result.returncode == 0:
-                    print(f"[download] Download succeeded: {local_path}", flush=True)
-                    return local_path
-                else:
-                    print(f"[download] Download failed: {remote_path}, error: {result.stderr}", flush=True)
-                    return None
-            else:
-                # 本地路径：直接复制文件
-                local_bos_path = Path(self.bos_path).resolve()
-                remote_path = local_bos_path / filename
-                
-                print(f"[download] Copying file from local path: {remote_path}", flush=True)
-                
-                if not remote_path.exists():
-                    print(f"[download] File not found: {remote_path}", flush=True)
-                    return None
-                
-                # 复制文件
-                import shutil
-                shutil.copy2(remote_path, local_path)
-                print(f"[download] File copied successfully: {local_path}", flush=True)
+            result = self._bcecmd_cp(remote_path, str(local_path), "download")
+            if result.returncode == 0:
+                print(f"[download] Download succeeded: {local_path}", flush=True)
                 return local_path
-                
+            else:
+                print(
+                    f"[download] Download failed: {remote_path}, stderr: {result.stderr}",
+                    flush=True,
+                )
+                return None
         except Exception as e:
             print(f"[download] Download failed: {e}", flush=True)
             return None
@@ -172,7 +151,7 @@ def _run_paddle_on_gpu(self):
         try:
             # 设置GPU设备
             paddle.set_device("gpu:0")
-            
+
             # 解析Paddle API信息
             if not self.ana_paddle_api_info():
                 print("ana_paddle_api_info failed", flush=True)
@@ -188,23 +167,27 @@ def _run_paddle_on_gpu(self):
                 return None, None
 
             # 执行Forward
-            paddle_output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
-            
+            paddle_output = self.paddle_api(
+                *tuple(self.paddle_args), **self.paddle_kwargs
+            )
+
             # 执行Backward（如果需要）
             paddle_grads = None
             if self.need_check_grad():
                 inputs_list = self.get_paddle_input_list()
-                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(paddle_output)
+                result_outputs, result_outputs_grads = (
+                    self.gen_paddle_output_and_output_grad(paddle_output)
+                )
                 if inputs_list and result_outputs and result_outputs_grads:
                     paddle_grads = paddle.grad(
-                        outputs=result_outputs, 
-                        inputs=inputs_list, 
+                        outputs=result_outputs,
+                        inputs=inputs_list,
                         grad_outputs=result_outputs_grads,
-                        allow_unused=True
+                        allow_unused=True,
                     )
-            
+
             return paddle_output, paddle_grads
-            
+
         except Exception as e:
             print(f"[paddle gpu error] {self.api_config.config}: {e}", flush=True)
             write_to_log("paddle_error", self.api_config.config)
@@ -214,7 +197,7 @@ def _run_paddle_on_custom_device(self):
         """在Paddle自定义设备上运行"""
         try:
             paddle_device_type = "cpu"  # 默认为CPU
-            
+
             # 设置自定义设备
             if self.check_xpu_available():
                 paddle.set_device(f"xpu:{self.xpu_device_id}")
@@ -241,96 +224,171 @@ def _run_paddle_on_custom_device(self):
                 return None, None
 
             # 执行Forward
-            paddle_output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
+            paddle_output = self.paddle_api(
+                *tuple(self.paddle_args), **self.paddle_kwargs
+            )
 
             # 执行Backward（如果需要）
             paddle_grads = None
             if self.need_check_grad():
                 inputs_list = self.get_paddle_input_list()
-                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(paddle_output)
+                result_outputs, result_outputs_grads = (
+                    self.gen_paddle_output_and_output_grad(paddle_output)
+                )
                 if inputs_list and result_outputs and result_outputs_grads:
                     paddle_grads = paddle.grad(
-                        outputs=result_outputs, 
-                        inputs=inputs_list, 
+                        outputs=result_outputs,
+                        inputs=inputs_list,
                         grad_outputs=result_outputs_grads,
-                        allow_unused=True
+                        allow_unused=True,
                     )
-            
+
             return paddle_output, paddle_grads
-            
+
         except Exception as e:
-            print(f"[paddle {paddle_device_type} error] {self.api_config.config}: {e}", flush=True)
+            print(
+                f"[paddle {paddle_device_type} error] {self.api_config.config}: {e}",
+                flush=True,
+            )
             write_to_log("paddle_error", self.api_config.config)
             return None, None
 
     def _compare_with_downloaded(self, local_output, local_grads, downloaded_tensor):
         """与下载的结果进行对比"""
         try:
-            print(f"[compare] Comparing results for {self.api_config.config}", flush=True)
-            
+            print(
+                f"[compare] Comparing results for {self.api_config.config}", flush=True
+            )
+
             # 加载下载的数据
             remote_data = paddle.load(str(downloaded_tensor))
-            remote_output = remote_data['output']
-            
+            remote_output = remote_data["output"]
+
             # 对比Forward输出（直接使用Paddle对比）
             try:
-                if isinstance(local_output, paddle.Tensor) and isinstance(remote_output, paddle.Tensor):
+                if isinstance(local_output, paddle.Tensor) and isinstance(
+                    remote_output, paddle.Tensor
+                ):
                     # 使用Paddle的对比方法
                     np.testing.assert_allclose(
-                        local_output.numpy(), remote_output.numpy(), 
-                        atol=self.atol, rtol=self.rtol, equal_nan=True
+                        local_output.numpy(),
+                        remote_output.numpy(),
+                        atol=self.atol,
+                        rtol=self.rtol,
+                        equal_nan=True,
                     )
-                elif isinstance(local_output, (list, tuple)) and isinstance(remote_output, (list, tuple)):
+                elif isinstance(local_output, (list, tuple)) and isinstance(
+                    remote_output, (list, tuple)
+                ):
                     # 列表或元组对比
-                    for i, (local_item, remote_item) in enumerate(zip(local_output, remote_output)):
-                        if isinstance(local_item, paddle.Tensor) and isinstance(remote_item, paddle.Tensor):
+                    for i, (local_item, remote_item) in enumerate(
+                        zip(local_output, remote_output)
+                    ):
+                        if isinstance(local_item, paddle.Tensor) and isinstance(
+                            remote_item, paddle.Tensor
+                        ):
                             np.testing.assert_allclose(
-                                local_item.numpy(), remote_item.numpy(), 
-                                atol=self.atol, rtol=self.rtol, equal_nan=True
+                                local_item.numpy(),
+                                remote_item.numpy(),
+                                atol=self.atol,
+                                rtol=self.rtol,
+                                equal_nan=True,
+                            )
+                            print(
+                                f"[compare] Forward output[{i}] comparison passed",
+                                flush=True,
                             )
-                            print(f"[compare] Forward output[{i}] comparison passed", flush=True)
                 else:
                     # 其他情况，尝试转换为numpy对比
-                    local_np = local_output.numpy() if isinstance(local_output, paddle.Tensor) else np.array(local_output)
-                    remote_np = remote_output.numpy() if isinstance(remote_output, paddle.Tensor) else np.array(remote_output)
-                    np.testing.assert_allclose(local_np, remote_np, atol=self.atol, rtol=self.rtol, equal_nan=True)
-                
-                print(f"[compare] Forward accuracy check passed for {self.api_config.config}", flush=True)
+                    local_np = (
+                        local_output.numpy()
+                        if isinstance(local_output, paddle.Tensor)
+                        else np.array(local_output)
+                    )
+                    remote_np = (
+                        remote_output.numpy()
+                        if isinstance(remote_output, paddle.Tensor)
+                        else np.array(remote_output)
+                    )
+                    np.testing.assert_allclose(
+                        local_np,
+                        remote_np,
+                        atol=self.atol,
+                        rtol=self.rtol,
+                        equal_nan=True,
+                    )
+
+                print(
+                    f"[compare] Forward accuracy check passed for {self.api_config.config}",
+                    flush=True,
+                )
             except Exception as e:
-                print(f"[compare] Forward accuracy check failed for {self.api_config.config}, error: {e}", flush=True)
+                print(
+                    f"[compare] Forward accuracy check failed for {self.api_config.config}, error: {e}",
+                    flush=True,
+                )
                 write_to_log("accuracy_error", self.api_config.config)
                 return False
-            
+
             # 对比Backward梯度（如果存在且Forward通过）
-            if local_grads is not None and 'grads' in remote_data:
-                remote_grads = remote_data['grads']
-                
+            if local_grads is not None and "grads" in remote_data:
+                remote_grads = remote_data["grads"]
+
                 try:
-                    if isinstance(local_grads, (list, tuple)) and isinstance(remote_grads, (list, tuple)):
-                        for i, (local_grad, remote_grad) in enumerate(zip(local_grads, remote_grads)):
-                            if isinstance(local_grad, paddle.Tensor) and isinstance(remote_grad, paddle.Tensor):
+                    if isinstance(local_grads, (list, tuple)) and isinstance(
+                        remote_grads, (list, tuple)
+                    ):
+                        for i, (local_grad, remote_grad) in enumerate(
+                            zip(local_grads, remote_grads)
+                        ):
+                            if isinstance(local_grad, paddle.Tensor) and isinstance(
+                                remote_grad, paddle.Tensor
+                            ):
                                 np.testing.assert_allclose(
-                                    local_grad.numpy(), remote_grad.numpy(), 
-                                    atol=self.atol, rtol=self.rtol, equal_nan=True
+                                    local_grad.numpy(),
+                                    remote_grad.numpy(),
+                                    atol=self.atol,
+                                    rtol=self.rtol,
+                                    equal_nan=True,
+                                )
+                                print(
+                                    f"[compare] Backward gradient[{i}] comparison passed",
+                                    flush=True,
                                 )
-                                print(f"[compare] Backward gradient[{i}] comparison passed", flush=True)
-                    elif isinstance(local_grads, paddle.Tensor) and isinstance(remote_grads, paddle.Tensor):
+                    elif isinstance(local_grads, paddle.Tensor) and isinstance(
+                        remote_grads, paddle.Tensor
+                    ):
                         np.testing.assert_allclose(
-                            local_grads.numpy(), remote_grads.numpy(), 
-                            atol=self.atol, rtol=self.rtol, equal_nan=True
+                            local_grads.numpy(),
+                            remote_grads.numpy(),
+                            atol=self.atol,
+                            rtol=self.rtol,
+                            equal_nan=True,
                         )
-                    
-                    print(f"[compare] Backward gradient check passed for {self.api_config.config}", flush=True)
+
+                    print(
+                        f"[compare] Backward gradient check passed for {self.api_config.config}",
+                        flush=True,
+                    )
                 except Exception as e:
-                    print(f"[compare] Backward gradient check failed for {self.api_config.config}, error: {e}", flush=True)
+                    print(
+                        f"[compare] Backward gradient check failed for {self.api_config.config}, error: {e}",
+                        flush=True,
+                    )
                     return False
-            
-            print(f"[compare] Accuracy check passed for {self.api_config.config}", flush=True)
+
+            print(
+                f"[compare] Accuracy check passed for {self.api_config.config}",
+                flush=True,
+            )
             write_to_log("pass", self.api_config.config)
             return True
-            
+
         except Exception as e:
-            print(f"[compare] Comparison failed for {self.api_config.config}, error: {e}", flush=True)
+            print(
+                f"[compare] Comparison failed for {self.api_config.config}, error: {e}",
+                flush=True,
+            )
             write_to_log("accuracy_error", self.api_config.config)
             return False
 
@@ -348,64 +406,86 @@ def test(self):
     def _test_upload_mode(self):
         """Upload模式：执行测试并上传结果"""
         print(f"[upload] Starting upload mode for {self.api_config.config}", flush=True)
-        
+
         local_device_type = self._get_local_device_type()
-        
+
         if local_device_type == "gpu":
             # GPU端：使用Paddle在GPU上执行
             output, grads = self._run_paddle_on_gpu()
         else:
             # PaddleDevice端：使用Paddle在自定义设备上执行
             output, grads = self._run_paddle_on_custom_device()
-        
+
         if output is None:
             print(f"[upload] Execution failed for {self.api_config.config}", flush=True)
             return
-            
+
         # 保存结果到本地PDTensor
         local_path = self._save_tensor_locally(output, grads)
-        
+
         # 异步上传到BOS
         self._upload_to_bos(local_path)
-        
-        print(f"[upload] Upload mode completed for {self.api_config.config}", flush=True)
+
+        print(
+            f"[upload] Upload mode completed for {self.api_config.config}", flush=True
+        )
 
     def _test_download_mode(self):
         """Download模式：下载对比数据并验证"""
-        print(f"[download] Starting download mode for {self.api_config.config}", flush=True)
-        
+        print(
+            f"[download] Starting download mode for {self.api_config.config}",
+            flush=True,
+        )
+
         # 确定要下载的文件名
         target_filename = self._get_filename(self.target_device_type)
-        
+
         # 下载文件
         downloaded_file = self._download_from_bos(target_filename)
         if downloaded_file is None:
-            print(f"[download] Failed to download comparison data for {self.api_config.config}", flush=True)
+            print(
+                f"[download] Failed to download comparison data for {self.api_config.config}",
+                flush=True,
+            )
             return
-        
+
         # 在本地设备上执行测试
         local_device_type = self._get_local_device_type()
-        
+
         if local_device_type == "gpu":
             # GPU端：使用Paddle在GPU上执行
             local_output, local_grads = self._run_paddle_on_gpu()
         else:
             # PaddleDevice端：使用Paddle在自定义设备上执行
             local_output, local_grads = self._run_paddle_on_custom_device()
-        
+
         if local_output is None:
-            print(f"[download] Local execution failed for {self.api_config.config}", flush=True)
+            print(
+                f"[download] Local execution failed for {self.api_config.config}",
+                flush=True,
+            )
             return
-            
+
         # 与下载的结果进行对比
-        success = self._compare_with_downloaded(local_output, local_grads, downloaded_file)
-        
+        success = self._compare_with_downloaded(
+            local_output, local_grads, downloaded_file
+        )
+
         # 清理下载的文件
         downloaded_file.unlink(missing_ok=True)
-        
-        print(f"[download] Download mode completed for {self.api_config.config}", flush=True)
+
+        print(
+            f"[download] Download mode completed for {self.api_config.config}",
+            flush=True,
+        )
 
     def _test_local_mode(self):
         """默认模式：本地直接对比（暂不支持）"""
-        print(f"[local] Local mode not implemented yet for {self.api_config.config}", flush=True)
-        print("[info] Please specify --operation_mode=upload or --operation_mode=download", flush=True)
+        print(
+            f"[local] Local mode not implemented yet for {self.api_config.config}",
+            flush=True,
+        )
+        print(
+            "[info] Please specify --operation_mode=upload or --operation_mode=download",
+            flush=True,
+        )

From 511871754cb8082ac8d33670ceef23324092f838 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Tue, 9 Dec 2025 17:26:27 +0800
Subject: [PATCH 05/13] fix bugs in gpu

---
 tester/paddle_device_vs_gpu.py | 120 ++++++---------------------------
 1 file changed, 20 insertions(+), 100 deletions(-)

diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py
index a74235f0..ea8f5a67 100644
--- a/tester/paddle_device_vs_gpu.py
+++ b/tester/paddle_device_vs_gpu.py
@@ -44,18 +44,9 @@ def _get_config_hash(self):
         return hashlib.md5(config_str.encode()).hexdigest()[:16]
 
     def _get_local_device_type(self):
-        """获取当前设备的类型"""
-        try:
-            if torch.cuda.is_available():  # 检查GPU是否可用
-                return "gpu"
-            elif self.check_xpu_available():
-                return "xpu"
-            elif self.check_custom_device_available():
-                return self.custom_device_type
-            else:
-                return "cpu"
-        except:
-            return "cpu"
+        """获取当前设备的类型，优先复用 engineV2 的检测逻辑。"""
+        from engineV2 import detect_device_type
+        return detect_device_type()
 
     def _get_filename(self, device_type=None):
         """生成PDTensor文件名"""
@@ -146,75 +137,27 @@ def _download_from_bos(self, filename):
             print(f"[download] Download failed: {e}", flush=True)
             return None
 
-    def _run_paddle_on_gpu(self):
-        """在GPU上运行Paddle实现"""
-        try:
-            # 设置GPU设备
-            paddle.set_device("gpu:0")
-
-            # 解析Paddle API信息
-            if not self.ana_paddle_api_info():
-                print("ana_paddle_api_info failed", flush=True)
-                return None, None
-
-            # 生成输入数据
-            if not self.gen_numpy_input():
-                print("gen_numpy_input failed", flush=True)
-                return None, None
-
-            if not self.gen_paddle_input():
-                print("gen_paddle_input failed", flush=True)
-                return None, None
-
-            # 执行Forward
-            paddle_output = self.paddle_api(
-                *tuple(self.paddle_args), **self.paddle_kwargs
-            )
-
-            # 执行Backward（如果需要）
-            paddle_grads = None
-            if self.need_check_grad():
-                inputs_list = self.get_paddle_input_list()
-                result_outputs, result_outputs_grads = (
-                    self.gen_paddle_output_and_output_grad(paddle_output)
-                )
-                if inputs_list and result_outputs and result_outputs_grads:
-                    paddle_grads = paddle.grad(
-                        outputs=result_outputs,
-                        inputs=inputs_list,
-                        grad_outputs=result_outputs_grads,
-                        allow_unused=True,
-                    )
-
-            return paddle_output, paddle_grads
-
-        except Exception as e:
-            print(f"[paddle gpu error] {self.api_config.config}: {e}", flush=True)
-            write_to_log("paddle_error", self.api_config.config)
-            return None, None
-
-    def _run_paddle_on_custom_device(self):
-        """在Paddle自定义设备上运行"""
+    def _run_paddle(self, device_type: str):
+        """在指定设备上运行 Paddle（统一 GPU / XPU / 自定义设备逻辑）。"""
         try:
-            paddle_device_type = "cpu"  # 默认为CPU
-
-            # 设置自定义设备
-            if self.check_xpu_available():
+            paddle_device_type = device_type
+            if device_type == "gpu":
+                # engineV2.py sets CUDA_VISIBLE_DEVICES, so paddle will use the correct GPU.
+                paddle.set_device("gpu")
+            elif device_type == "xpu":
                 paddle.set_device(f"xpu:{self.xpu_device_id}")
-                paddle_device_type = "xpu"
-            elif self.check_custom_device_available():
+            elif device_type == self.custom_device_type and self.check_custom_device_available():
                 paddle.set_device(f"{self.custom_device_type}:{self.custom_device_id}")
-                paddle_device_type = self.custom_device_type
+            elif device_type == "cpu":
+                paddle.set_device("cpu")
             else:
                 print(f"[error] No custom device available", flush=True)
                 return None, None
 
-            # 解析Paddle API信息
             if not self.ana_paddle_api_info():
                 print("ana_paddle_api_info failed", flush=True)
                 return None, None
 
-            # 生成输入数据
             if not self.gen_numpy_input():
                 print("gen_numpy_input failed", flush=True)
                 return None, None
@@ -223,12 +166,10 @@ def _run_paddle_on_custom_device(self):
                 print("gen_paddle_input failed", flush=True)
                 return None, None
 
-            # 执行Forward
             paddle_output = self.paddle_api(
                 *tuple(self.paddle_args), **self.paddle_kwargs
             )
 
-            # 执行Backward（如果需要）
             paddle_grads = None
             if self.need_check_grad():
                 inputs_list = self.get_paddle_input_list()
@@ -399,22 +340,18 @@ def test(self):
         elif self.operation_mode == "download":
             self._test_download_mode()
         else:
-            # 默认模式：本地直接对比
-            print("[info] No operation mode specified, running in local mode")
-            self._test_local_mode()
+            print(
+                "[error] operation_mode 不能为空，请指定 --operation_mode=upload 或 download",
+                flush=True,
+            )
+            return
 
     def _test_upload_mode(self):
         """Upload模式：执行测试并上传结果"""
         print(f"[upload] Starting upload mode for {self.api_config.config}", flush=True)
 
         local_device_type = self._get_local_device_type()
-
-        if local_device_type == "gpu":
-            # GPU端：使用Paddle在GPU上执行
-            output, grads = self._run_paddle_on_gpu()
-        else:
-            # PaddleDevice端：使用Paddle在自定义设备上执行
-            output, grads = self._run_paddle_on_custom_device()
+        output, grads = self._run_paddle(local_device_type)
 
         if output is None:
             print(f"[upload] Execution failed for {self.api_config.config}", flush=True)
@@ -451,13 +388,7 @@ def _test_download_mode(self):
 
         # 在本地设备上执行测试
         local_device_type = self._get_local_device_type()
-
-        if local_device_type == "gpu":
-            # GPU端：使用Paddle在GPU上执行
-            local_output, local_grads = self._run_paddle_on_gpu()
-        else:
-            # PaddleDevice端：使用Paddle在自定义设备上执行
-            local_output, local_grads = self._run_paddle_on_custom_device()
+        local_output, local_grads = self._run_paddle(local_device_type)
 
         if local_output is None:
             print(
@@ -478,14 +409,3 @@ def _test_download_mode(self):
             f"[download] Download mode completed for {self.api_config.config}",
             flush=True,
         )
-
-    def _test_local_mode(self):
-        """默认模式：本地直接对比（暂不支持）"""
-        print(
-            f"[local] Local mode not implemented yet for {self.api_config.config}",
-            flush=True,
-        )
-        print(
-            "[info] Please specify --operation_mode=upload or --operation_mode=download",
-            flush=True,
-        )

From c2eec21cc5845c8e034db4579f5fe2152dc8bfd9 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Tue, 9 Dec 2025 18:23:59 +0800
Subject: [PATCH 06/13] iluvatar_gpu

---
 engineV2.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/engineV2.py b/engineV2.py
index 89a1d08e..8cc2aa2c 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -135,7 +135,7 @@ def detect_device_type() -> str:
         try:
             out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
             if any(re.match(r"^\|\s*\d+\s+Iluvatar", line) for line in out.splitlines()):
-                DEVICE_TYPE = "iluvatar"
+                DEVICE_TYPE = "iluvatar_gpu"
                 DEVICE_TYPE_DETECTED = True
                 return DEVICE_TYPE
         except Exception:
@@ -175,7 +175,7 @@ def get_device_count() -> int:
         DEVICE_COUNT = len(ids)
         return DEVICE_COUNT
 
-    if device_type == "iluvatar":
+    if device_type == "iluvatar_gpu":
         out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
         ids = set()
         for line in out.splitlines():
@@ -214,7 +214,7 @@ def _refresh_snapshot(device_type):
                         snapshot[dev_id] = (total_mib / 1024.0, used_mib / 1024.0)
                         break
 
-    elif device_type == "iluvatar":
+    elif device_type == "iluvatar_gpu":
         out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
         lines = out.splitlines()
         for i, line in enumerate(lines):
@@ -251,7 +251,7 @@ def get_memory_info(gpu_id):
         finally:
             pynvml.nvmlShutdown()
 
-    if device_type in ("xpu", "iluvatar"):
+    if device_type in ("xpu", "iluvatar_gpu"):
         _refresh_snapshot(device_type)
         if _MEM_SNAPSHOT is None or gpu_id not in _MEM_SNAPSHOT:
             raise RuntimeError(f"Failed to get memory info for {device_type} device {gpu_id}")

From 2fa60449d6934c444c61979d5f6a720ef61b73ad Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Wed, 10 Dec 2025 17:40:51 +0800
Subject: [PATCH 07/13] update readme

---
 engineV2-README.md | 82 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/engineV2-README.md b/engineV2-README.md
index 65912f11..d0687910 100644
--- a/engineV2-README.md
+++ b/engineV2-README.md
@@ -86,6 +86,12 @@
 | `--timeout`                      | int   | 单个测试用例执行超时秒数（默认 1800）                                                  |
 | `--show_runtime_status`          | bool  | 是否实时显示当前的测试进度（默认 True）                                               |
 | `--random_seed`                  | int   | numpy random的随机种子(默认为0，此时不会显式设置numpy random的seed)                   |
+| `--custom_device_vs_gpu`         | bool  | 运行自定义设备与GPU的精度对比测试（默认 False）                                        |
+| `--operation_mode`               | str   | 操作模式：`upload` 或 `download`（仅在启用 `--custom_device_vs_gpu` 时有效）           |
+| `--bos_path`                     | str   | BOS 存储路径（如 `xly-devops/liujingzong/`）                                           |
+| `--bos_conf_path`                | str   | BOS 配置文件路径（默认 `./conf`）                                                      |
+| `--bcecmd_path`                  | str   | bcecmd 命令行工具路径（默认 `./bcecmd`）                                               |
+| `--target_device_type`           | str   | 目标设备类型（如 `xpu`），仅在 `operation_mode=download` 时使用                        |
 
 
 ### 示例命令
@@ -122,6 +128,82 @@ python engineV2.py --accuracy=True --api_config_file="tester/api_config/api_conf
 ```
 该脚本使用参数：`NUM_GPUS=-1, NUM_WORKERS_PER_GPU=-1, GPU_IDS="4,5,6,7"`，在后台运行程序，可在修改 `run.sh` 参数后使用
 
+### 自定义设备与 GPU 精度对比测试
+
+#### 功能说明
+
+`APITestPaddleDeviceVSGPU` 类支持跨设备的精度对比测试，特别适用于自定义设备（Custom Device）与 GPU 的一致性验证。该功能分为两个模式：
+
+- **Upload 模式**：在源设备（如 GPU）上执行测试，保存结果到本地，然后上传到 BOS 云存储
+- **Download 模式**：从 BOS 云存储下载参考数据，在目标设备（如 XPU）上执行测试，与参考数据进行精度对比
+
+#### 工作流程
+
+1. **Upload 模式工作流**：
+   - 在当前设备上执行 Paddle API 测试（可以是 GPU、XPU 或其他自定义设备）
+   - 保存 Forward 输出和 Backward 梯度到本地 PDTensor 文件
+   - 文件名自动包含当前设备类型标识（如 `xpu-1210-xxx.pdtensor`）
+   - 使用 bcecmd 工具将文件上传到 BOS 云存储
+
+2. **Download 模式工作流**：
+   - 指定 `--target_device_type` 参数，从 BOS 云存储下载该设备的参考数据
+   - 在当前设备上执行相同的 Paddle API 测试
+   - 对比 Forward 输出和 Backward 梯度，验证与参考设备的精度一致性
+
+#### 命令示例
+
+**场景 1：在 XPU 上执行测试并上传结果**
+```bash
+# 在 XPU 设备上执行，生成 xpu-1210-xxx.pdtensor 文件并上传到 BOS
+python engineV2.py --custom_device_vs_gpu=True --operation_mode=upload \
+  --bos_path="xly-devops/liujingzong/" \
+  --bos_conf_path="./conf" \
+  --bcecmd_path="./bcecmd" \
+  --random_seed=1210 \
+  --api_config_file="./test1.txt" \
+  --gpu_id=7
+```
+
+**场景 2：在 GPU 上下载 XPU 的参考数据并进行精度对比**
+```bash
+# 在 GPU 设备上执行，从 BOS 下载 XPU 的参考数据（xpu-1210-xxx.pdtensor）
+# 然后在 GPU 上运行相同的测试，对比结果验证精度一致性
+python engineV2.py --custom_device_vs_gpu=True --operation_mode=download \
+  --target_device_type=xpu \
+  --bos_path="xly-devops/liujingzong/" \
+  --bos_conf_path="./conf" \
+  --bcecmd_path="./bcecmd" \
+  --random_seed=1210 \
+  --api_config_file="./test1.txt" \
+  --gpu_id=7
+```
+
+**场景 3：在 GPU 上执行测试并上传结果**
+```bash
+# 在 GPU 设备上执行，生成 gpu-1210-xxx.pdtensor 文件并上传到 BOS
+python engineV2.py --custom_device_vs_gpu=True --operation_mode=upload \
+  --bos_path="xly-devops/liujingzong/" \
+  --bos_conf_path="./conf" \
+  --bcecmd_path="./bcecmd" \
+  --random_seed=1210 \
+  --api_config_file="./test1.txt" \
+  --gpu_id=7
+```
+
+**场景 4：在 XPU 上下载 GPU 的参考数据并进行精度对比**
+```bash
+# 在 XPU 设备上执行，从 BOS 下载 GPU 的参考数据（gpu-1210-xxx.pdtensor）
+# 然后在 XPU 上运行相同的测试，对比结果验证精度一致性
+python engineV2.py --custom_device_vs_gpu=True --operation_mode=download \
+  --target_device_type=gpu \
+  --bos_path="xly-devops/liujingzong/" \
+  --bos_conf_path="./conf" \
+  --bcecmd_path="./bcecmd" \
+  --random_seed=1210 \
+  --api_config_file="./test1.txt" \
+  --gpu_id=7
+```
+
 ## 监控方法
 
 执行 `run.sh` 后可通过以下方式监控：

From 9be47d6eb370841a515c21fdd84ef855e0e766c7 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Fri, 12 Dec 2025 14:44:45 +0800
Subject: [PATCH 08/13] =?UTF-8?q?=E5=87=8F=E5=B0=91=E5=8F=82=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 engineV2-README.md             |  54 +++++++-------
 engineV2.py                    | 126 +++++++++++++++++++++------------
 tester/bos_config.yaml         |  12 ++++
 tester/paddle_device_vs_gpu.py |   2 +-
 4 files changed, 121 insertions(+), 73 deletions(-)
 create mode 100644 tester/bos_config.yaml

diff --git a/engineV2-README.md b/engineV2-README.md
index aa6cf6cb..5709503e 100644
--- a/engineV2-README.md
+++ b/engineV2-README.md
@@ -86,12 +86,8 @@
 | `--timeout`                      | int   | 单个测试用例执行超时秒数（默认 1800）                                                  |
 | `--show_runtime_status`          | bool  | 是否实时显示当前的测试进度（默认 True）                                               |
 | `--random_seed`                  | int   | numpy random的随机种子(默认为0，此时不会显式设置numpy random的seed)                   |
-| `--custom_device_vs_gpu`         | bool  | 运行自定义设备与GPU的精度对比测试（默认 False）                                        |
-| `--operation_mode`               | str   | 操作模式：`upload` 或 `download`（仅在启用 `--custom_device_vs_gpu` 时有效）           |
-| `--bos_path`                     | str   | BOS 存储路径（如 `xly-devops/liujingzong/`）                                           |
-| `--bos_conf_path`                | str   | BOS 配置文件路径（默认 `./conf`）                                                      |
-| `--bcecmd_path`                  | str   | bcecmd 命令行工具路径（默认 `./bcecmd`）                                               |
-| `--target_device_type`           | str   | 目标设备类型（如 `xpu`），仅在 `operation_mode=download` 时使用                        |
+| `--custom_device_vs_gpu`        | str   | 运行自定义设备与GPU的精度对比测试：`upload` 或 `download`（默认 None）                  |
+| `--target_device_type`           | str   | 目标设备类型（如 `xpu`），仅在 `--custom_device_vs_gpu=download` 时使用                |
 | `--bitwise_alignment`            | bool  | 是否进行诸位对齐对比，开启后所有的api的精度对比都按照atol=0.0,rtol = 0.0的精度对比结果|
 
 
@@ -151,58 +147,64 @@ python engineV2.py --accuracy=True --api_config_file="tester/api_config/api_conf
    - 在当前设备上执行相同的 Paddle API 测试
    - 对比 Forward 输出和 Backward 梯度，验证与参考设备的精度一致性
 
+#### 配置文件设置
+
+首先，编辑 `tester/bos_config.yaml` 配置文件：
+
+```yaml
+# BOS 配置文件
+# 用于自定义设备与 GPU 精度对比测试的云存储配置
+
+# BOS 存储路径（如：xly-devops/liujingzong/）
+bos_path: "xly-devops/liujingzong/"
+
+# BOS 配置文件路径（bcecmd 使用的配置文件路径）
+bos_conf_path: "./conf"
+
+# bcecmd 命令行工具路径
+bcecmd_path: "./bcecmd"
+```
+
 #### 命令示例
 
 **场景 1：在 XPU 上执行测试并上传结果**
 ```bash
 # 在 XPU 设备上执行，生成 xpu-1210-xxx.pdtensor 文件并上传到 BOS
-python engineV2.py --custom_device_vs_gpu=True --operation_mode=upload \
-  --bos_path="xly-devops/liujingzong/" \
-  --bos_conf_path="./conf" \
-  --bcecmd_path="./bcecmd" \
+python engineV2.py --custom_device_vs_gpu=upload \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
-  --gpu_id=7
+  --gpu_ids=7
 ```
 
 **场景 2：在 GPU 上下载 XPU 的参考数据并进行精度对比**
 ```bash
 # 在 GPU 设备上执行，从 BOS 下载 XPU 的参考数据（xpu-1210-xxx.pdtensor）
 # 然后在 GPU 上运行相同的测试，对比结果验证精度一致性
-python engineV2.py --custom_device_vs_gpu=True --operation_mode=download \
+python engineV2.py --custom_device_vs_gpu=download \
   --target_device_type=xpu \
-  --bos_path="xly-devops/liujingzong/" \
-  --bos_conf_path="./conf" \
-  --bcecmd_path="./bcecmd" \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
-  --gpu_id=7
+  --gpu_ids=7
 ```
 
 **场景 3：在 GPU 上执行测试并上传结果**
 ```bash
 # 在 GPU 设备上执行，生成 gpu-1210-xxx.pdtensor 文件并上传到 BOS
-python engineV2.py --custom_device_vs_gpu=True --operation_mode=upload \
-  --bos_path="xly-devops/liujingzong/" \
-  --bos_conf_path="./conf" \
-  --bcecmd_path="./bcecmd" \
+python engineV2.py --custom_device_vs_gpu=upload \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
-  --gpu_id=7
+  --gpu_ids=7
 ```
 
 **场景 4：在 XPU 上下载 GPU 的参考数据并进行精度对比**
 ```bash
 # 在 XPU 设备上执行，从 BOS 下载 GPU 的参考数据（gpu-1210-xxx.pdtensor）
 # 然后在 XPU 上运行相同的测试，对比结果验证精度一致性
-python engineV2.py --custom_device_vs_gpu=True --operation_mode=download \
+python engineV2.py --custom_device_vs_gpu=download \
   --target_device_type=gpu \
-  --bos_path="xly-devops/liujingzong/" \
-  --bos_conf_path="./conf" \
-  --bcecmd_path="./bcecmd" \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
-  --gpu_id=7
+  --gpu_ids=7
 ```
 
 ## 监控方法
diff --git a/engineV2.py b/engineV2.py
index 116d2299..ebdeb869 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -12,10 +12,12 @@
 from concurrent.futures import TimeoutError, as_completed
 from datetime import datetime
 from multiprocessing import Lock, Manager, cpu_count, set_start_method
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 import numpy as np
 import pynvml
+import yaml
 from pebble import ProcessExpired, ProcessPool
 
 if TYPE_CHECKING:
@@ -481,12 +483,16 @@ def run_test_case(api_config_str, options):
         "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
         "accuracy_stable": APITestAccuracyStable,
         "paddle_custom_device": APITestCustomDeviceVSCPU,
-        "custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
     }
-    test_class = next(
-        (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
-        APITestAccuracy,  # default fallback
-    )
+    
+    # 处理 custom_device_vs_gpu 模式
+    if options.custom_device_vs_gpu:
+        test_class = APITestPaddleDeviceVSGPU
+    else:
+        test_class = next(
+            (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
+            APITestAccuracy,  # default fallback
+        )
     kwargs = {k: v for k, v in vars(options).items() if k in VALID_TEST_ARGS}
     case = test_class(api_config, **kwargs)
     try:
@@ -664,33 +670,10 @@ def main():
     )
     parser.add_argument(
         "--custom_device_vs_gpu",
-        type=parse_bool,
-        default=False,
-        help="test paddle api on custom device vs GPU",
-    )
-    parser.add_argument(
-        "--operation_mode",
         type=str,
         choices=["upload", "download"],
-        help="Operation mode: upload or download",
-    )
-    parser.add_argument(
-        "--bos_path",
-        type=str,
-        default="",
-        help="BOS storage path (required when operation_mode is specified)",
-    )
-    parser.add_argument(
-        "--bos_conf_path",
-        type=str,
-        default="./conf",
-        help="Path for bcecmd --conf-path when using BOS",
-    )
-    parser.add_argument(
-        "--bcecmd_path",
-        type=str,
-        default="./bcecmd",
-        help="bcecmd binary path used for BOS upload/download",
+        default=None,
+        help="test paddle api on custom device vs GPU: 'upload' or 'download'",
     )
     parser.add_argument(
         "--target_device_type",
@@ -719,7 +702,7 @@ def main():
         options.paddle_torch_gpu_performance,
         options.accuracy_stable,
         options.paddle_custom_device,
-        options.custom_device_vs_gpu,
+        options.custom_device_vs_gpu is not None,
     ]
     if len([m for m in mode if m is True]) != 1:
         print(
@@ -732,16 +715,46 @@ def main():
             "--paddle_torch_gpu_performance"
             "--accuracy_stable"
             "--paddle_custom_device"
-            "--custom_device_vs_gpu"
-            " to True.",
+            "--custom_device_vs_gpu=upload or --custom_device_vs_gpu=download",
             flush=True,
         )
         return
+    
+    # 处理 custom_device_vs_gpu 模式的配置
+    bos_config_data = None
     if options.custom_device_vs_gpu:
-        if options.operation_mode and not options.bos_path:
-            print("--bos_path is required when --operation_mode is specified", flush=True)
+        # 读取 BOS 配置文件（固定路径：tester/bos_config.yaml）
+        bos_config_path = Path("tester/bos_config.yaml")
+        if not bos_config_path.exists():
+            print(f"BOS config file not found: {bos_config_path}", flush=True)
+            return
+        
+        try:
+            with open(bos_config_path, "r", encoding="utf-8") as f:
+                bos_config_data = yaml.safe_load(f)
+            
+            if not bos_config_data:
+                print(f"BOS config file is empty: {bos_config_path}", flush=True)
+                return
+            
+            # 验证必需的配置项
+            required_keys = ["bos_path", "bos_conf_path", "bcecmd_path"]
+            missing_keys = [key for key in required_keys if key not in bos_config_data]
+            if missing_keys:
+                print(f"Missing required keys in BOS config: {missing_keys}", flush=True)
+                return
+            
+            # 将配置添加到 options 中，以便传递给测试类
+            options.operation_mode = options.custom_device_vs_gpu
+            options.bos_path = bos_config_data["bos_path"]
+            options.bos_conf_path = bos_config_data["bos_conf_path"]
+            options.bcecmd_path = bos_config_data["bcecmd_path"]
+            
+        except Exception as e:
+            print(f"Failed to load BOS config file {bos_config_path}: {e}", flush=True)
             return
-        if options.operation_mode == "download" and not options.target_device_type:
+        
+        if options.custom_device_vs_gpu == "download" and not options.target_device_type:
             print("--target_device_type is required in download mode", flush=True)
             return
     if options.test_tol and not options.accuracy:
@@ -761,7 +774,8 @@ def main():
                             APITestCINNVSDygraph, APITestPaddleGPUPerformance,
                             APITestPaddleOnly,
                             APITestPaddleTorchGPUPerformance,
-                            APITestTorchGPUPerformance)
+                            APITestTorchGPUPerformance,
+                            APITestCustomDeviceVSCPU)
 
         # set log_writer
         set_engineV2()
@@ -784,16 +798,36 @@ def main():
             "accuracy_stable": APITestAccuracyStable,
             "paddle_custom_device": APITestCustomDeviceVSCPU,
         }
-        test_class = next(
-            (
-                cls
-                for opt, cls in option_to_class.items()
-                if getattr(options, opt, False)
-            ),
-            APITestAccuracy,  # default fallback
-        )
+        
+        # 处理 custom_device_vs_gpu 模式
+        if options.custom_device_vs_gpu:
+            from tester import APITestPaddleDeviceVSGPU
+            test_class = APITestPaddleDeviceVSGPU
+        else:
+            test_class = next(
+                (
+                    cls
+                    for opt, cls in option_to_class.items()
+                    if getattr(options, opt, False)
+                ),
+                APITestAccuracy,  # default fallback
+            )
 
-        if options.accuracy:
+        if options.custom_device_vs_gpu:
+            # custom_device_vs_gpu 模式需要传递额外参数
+            kwargs = {
+                "operation_mode": options.operation_mode,
+                "bos_path": options.bos_path,
+                "bos_conf_path": options.bos_conf_path,
+                "bcecmd_path": options.bcecmd_path,
+                "random_seed": options.random_seed,
+                "atol": options.atol,
+                "rtol": options.rtol,
+            }
+            if options.target_device_type:
+                kwargs["target_device_type"] = options.target_device_type
+            case = test_class(api_config, **kwargs)
+        elif options.accuracy:
             case = test_class(
                 api_config,
                 test_amp=options.test_amp,
diff --git a/tester/bos_config.yaml b/tester/bos_config.yaml
new file mode 100644
index 00000000..a981d9c9
--- /dev/null
+++ b/tester/bos_config.yaml
@@ -0,0 +1,12 @@
+# BOS 配置文件
+# 用于自定义设备与 GPU 精度对比测试的云存储配置
+
+# BOS 存储路径（如：xly-devops/liujingzong/）
+bos_path: "xly-devops/liujingzong/"
+
+# BOS 配置文件路径（bcecmd 使用的配置文件路径）
+bos_conf_path: "./conf"
+
+# bcecmd 命令行工具路径
+bcecmd_path: "./bcecmd"
+
diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py
index ea8f5a67..9079cbbb 100644
--- a/tester/paddle_device_vs_gpu.py
+++ b/tester/paddle_device_vs_gpu.py
@@ -58,7 +58,7 @@ def _save_tensor_locally(self, output, grads=None):
         """保存结果到本地PDTensor文件"""
         # 保存到临时文件
         temp_dir = tempfile.gettempdir()
-        filename = self._get_filename().replace(".npz", ".pdtensor")
+        filename = self._get_filename()
         local_path = Path(temp_dir) / filename
 
         # 使用paddle.save保存张量数据

From 81d25235af546095c99a37e23a83b023e696d850 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Tue, 16 Dec 2025 10:25:05 +0800
Subject: [PATCH 09/13] Remove --target_device_type option and device prefix from dump filename

---
 engineV2-README.md             | 52 +++++++++-------------------------
 engineV2.py                    | 12 --------
 tester/paddle_device_vs_gpu.py | 13 ++++-----
 3 files changed, 19 insertions(+), 58 deletions(-)

diff --git a/engineV2-README.md b/engineV2-README.md
index 5709503e..5cd8e8ab 100644
--- a/engineV2-README.md
+++ b/engineV2-README.md
@@ -87,7 +87,6 @@
 | `--show_runtime_status`          | bool  | 是否实时显示当前的测试进度（默认 True）                                               |
 | `--random_seed`                  | int   | numpy random的随机种子(默认为0，此时不会显式设置numpy random的seed)                   |
 | `--custom_device_vs_gpu`        | str   | 运行自定义设备与GPU的精度对比测试：`upload` 或 `download`（默认 None）                  |
-| `--target_device_type`           | str   | 目标设备类型（如 `xpu`），仅在 `--custom_device_vs_gpu=download` 时使用                |
 | `--bitwise_alignment`            | bool  | 是否进行诸位对齐对比，开启后所有的api的精度对比都按照atol=0.0,rtol = 0.0的精度对比结果|
 
 
@@ -129,23 +128,24 @@ python engineV2.py --accuracy=True --api_config_file="tester/api_config/api_conf
 
 #### 功能说明
 
-`APITestPaddleDeviceVSGPU` 类支持跨设备的精度对比测试，特别适用于自定义设备（Custom Device）与 GPU 的一致性验证。该功能分为两个模式：
+`APITestPaddleDeviceVSGPU` 类支持跨设备的精度对比测试，目前主要面向 **GPU 上传 + XPU（或其他设备）下载对比** 这一典型场景。该功能分为两个模式：
 
-- **Upload 模式**：在源设备（如 GPU）上执行测试，保存结果到本地，然后上传到 BOS 云存储
-- **Download 模式**：从 BOS 云存储下载参考数据，在目标设备（如 XPU）上执行测试，与参考数据进行精度对比
+- **Upload 模式（GPU 侧）**：在 GPU 上执行测试，保存结果到本地，然后上传到 BOS 云存储
+- **Download 模式（XPU/其他设备侧）**：在 XPU 或其他设备上执行测试，从 BOS 下载 GPU 侧的参考数据进行精度对比
 
 #### 工作流程
 
-1. **Upload 模式工作流**：
-   - 在当前设备上执行 Paddle API 测试（可以是 GPU、XPU 或其他自定义设备）
+1. **Upload 模式工作流（GPU 侧）**：
+   - 在 GPU 设备上执行 Paddle API 测试
    - 保存 Forward 输出和 Backward 梯度到本地 PDTensor 文件
-   - 文件名自动包含当前设备类型标识（如 `xpu-1210-xxx.pdtensor`）
+   - 文件名依赖随机种子与配置哈希（如 `1210-xxx.pdtensor`）
    - 使用 bcecmd 工具将文件上传到 BOS 云存储
 
-2. **Download 模式工作流**：
-   - 指定 `--target_device_type` 参数，从 BOS 云存储下载该设备的参考数据
-   - 在当前设备上执行相同的 Paddle API 测试
-   - 对比 Forward 输出和 Backward 梯度，验证与参考设备的精度一致性
+2. **Download 模式工作流（XPU/其他设备侧）**：
+   - 在 XPU 或其他设备上执行相同的 Paddle API 测试
+   - 使用与 GPU 侧上传时一致的随机种子和配置，构造同名 PDTensor 文件名
+   - 从 BOS 云存储下载对应的 GPU 参考数据
+   - 对比 Forward 输出和 Backward 梯度，验证与 GPU 的精度一致性
 
 #### 配置文件设置
 
@@ -166,42 +166,18 @@ bcecmd_path: "./bcecmd"
 ```
 
 #### 命令示例
-
-**场景 1：在 XPU 上执行测试并上传结果**
-```bash
-# 在 XPU 设备上执行，生成 xpu-1210-xxx.pdtensor 文件并上传到 BOS
-python engineV2.py --custom_device_vs_gpu=upload \
-  --random_seed=1210 \
-  --api_config_file="./test1.txt" \
-  --gpu_ids=7
-```
-
-**场景 2：在 GPU 上下载 XPU 的参考数据并进行精度对比**
-```bash
-# 在 GPU 设备上执行，从 BOS 下载 XPU 的参考数据（xpu-1210-xxx.pdtensor）
-# 然后在 GPU 上运行相同的测试，对比结果验证精度一致性
-python engineV2.py --custom_device_vs_gpu=download \
-  --target_device_type=xpu \
-  --random_seed=1210 \
-  --api_config_file="./test1.txt" \
-  --gpu_ids=7
-```
-
-**场景 3：在 GPU 上执行测试并上传结果**
+**在 GPU 上执行测试并上传结果**
 ```bash
-# 在 GPU 设备上执行，生成 gpu-1210-xxx.pdtensor 文件并上传到 BOS
+# 在 GPU 设备上执行，生成1210-xxx.pdtensor 文件并上传到 BOS
 python engineV2.py --custom_device_vs_gpu=upload \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
   --gpu_ids=7
 ```
 
-**场景 4：在 XPU 上下载 GPU 的参考数据并进行精度对比**
+**在 XPU 上下载 GPU 的参考数据并进行精度对比**
 ```bash
-# 在 XPU 设备上执行，从 BOS 下载 GPU 的参考数据（gpu-1210-xxx.pdtensor）
-# 然后在 XPU 上运行相同的测试，对比结果验证精度一致性
 python engineV2.py --custom_device_vs_gpu=download \
-  --target_device_type=gpu \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
   --gpu_ids=7
diff --git a/engineV2.py b/engineV2.py
index ebdeb869..2b6a7a2d 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -49,7 +49,6 @@
     "test_tol",
     "operation_mode",
     "bos_path",
-    "target_device_type",
     "random_seed",
     "bos_conf_path",
     "bcecmd_path",
@@ -675,12 +674,6 @@ def main():
         default=None,
         help="test paddle api on custom device vs GPU: 'upload' or 'download'",
     )
-    parser.add_argument(
-        "--target_device_type",
-        type=str,
-        choices=["gpu", "xpu", "iluvatar_gpu"],
-        help="Target device type for download mode",
-    )
     parser.add_argument(
         "--bitwise_alignment",
         type=bool,
@@ -754,9 +747,6 @@ def main():
             print(f"Failed to load BOS config file {bos_config_path}: {e}", flush=True)
             return
         
-        if options.custom_device_vs_gpu == "download" and not options.target_device_type:
-            print("--target_device_type is required in download mode", flush=True)
-            return
     if options.test_tol and not options.accuracy:
         print(f"--test_tol takes effect when --accuracy is True.", flush=True)
     if options.test_backward and not options.paddle_cinn:
@@ -824,8 +814,6 @@ def main():
                 "atol": options.atol,
                 "rtol": options.rtol,
             }
-            if options.target_device_type:
-                kwargs["target_device_type"] = options.target_device_type
             case = test_class(api_config, **kwargs)
         elif options.accuracy:
             case = test_class(
diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py
index 9079cbbb..de20e451 100644
--- a/tester/paddle_device_vs_gpu.py
+++ b/tester/paddle_device_vs_gpu.py
@@ -19,7 +19,6 @@ def __init__(self, api_config, **kwargs):
         # 新增参数
         self.operation_mode = kwargs.get("operation_mode", None)
         self.bos_path = kwargs.get("bos_path", "")
-        self.target_device_type = kwargs.get("target_device_type", "")
         self.random_seed = kwargs.get("random_seed", 0)
         self.atol = kwargs.get("atol", 1e-2)
         self.rtol = kwargs.get("rtol", 1e-2)
@@ -48,11 +47,9 @@ def _get_local_device_type(self):
         from engineV2 import detect_device_type
         return detect_device_type()
 
-    def _get_filename(self, device_type=None):
-        """生成PDTensor文件名"""
-        if device_type is None:
-            device_type = self._get_local_device_type()
-        return f"{device_type}-{self.random_seed}-{self._get_config_hash()}.pdtensor"
+    def _get_filename(self):
+        """生成PDTensor文件名（不再包含设备前缀，只依赖随机种子和配置哈希）"""
+        return f"{self.random_seed}-{self._get_config_hash()}.pdtensor"
 
     def _save_tensor_locally(self, output, grads=None):
         """保存结果到本地PDTensor文件"""
@@ -374,8 +371,8 @@ def _test_download_mode(self):
             flush=True,
         )
 
-        # 确定要下载的文件名
-        target_filename = self._get_filename(self.target_device_type)
+        # 确定要下载的文件名（与 GPU 上传时保持一致）
+        target_filename = self._get_filename()
 
         # 下载文件
         downloaded_file = self._download_from_bos(target_filename)

From b4ff8949819720d542bd37b122bee1cf2fee1c50 Mon Sep 17 00:00:00 2001
From: Jingzong Liu <470699397@qq.com>
Date: Tue, 16 Dec 2025 19:16:24 +0800
Subject: [PATCH 10/13] Remove duplicate APITest imports from engineV2.py

Removed the duplicated APITestAccuracyStable and APITestCustomDeviceVSCPU entries from the tester import list.
---
 engineV2.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/engineV2.py b/engineV2.py
index cb5c82ea..c216d3aa 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -33,8 +33,6 @@
         APITestPaddleGPUPerformance,
         APITestPaddleOnly,
         APITestPaddleTorchGPUPerformance,
-        APITestAccuracyStable,
-        APITestCustomDeviceVSCPU,
         APITestPaddleDeviceVSGPU,
         APITestTorchGPUPerformance,
     )

From cee4c1505e6415295b7e3263be37d01632f6762e Mon Sep 17 00:00:00 2001
From: Jingzong Liu <470699397@qq.com>
Date: Tue, 16 Dec 2025 19:28:52 +0800
Subject: [PATCH 11/13] Add missing commas after APITestPaddleDeviceVSGPU entries

---
 engineV2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/engineV2.py b/engineV2.py
index c216d3aa..d9ac1209 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -393,7 +393,7 @@ def pid_exists(pid):
             APITestAccuracyStable,
             APITestCINNVSDygraph,
             APITestCustomDeviceVSCPU,
-            APITestPaddleDeviceVSGPU
+            APITestPaddleDeviceVSGPU,
             APITestPaddleGPUPerformance,
             APITestPaddleOnly,
             APITestPaddleTorchGPUPerformance,
@@ -410,7 +410,7 @@ def pid_exists(pid):
             "APITestPaddleTorchGPUPerformance": APITestPaddleTorchGPUPerformance,
             "APITestAccuracyStable": APITestAccuracyStable,
             "APITestCustomDeviceVSCPU": APITestCustomDeviceVSCPU,
-            "APITestPaddleDeviceVSGPU": APITestPaddleDeviceVSGPU
+            "APITestPaddleDeviceVSGPU": APITestPaddleDeviceVSGPU,
         }
         globals().update(test_classes)
 

From cfd88c03a695a6c539f985a04369be98602a60d7 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Tue, 16 Dec 2025 20:26:51 +0800
Subject: [PATCH 12/13] fmt

---
 engineV2.py                    | 23 ++++++++++++-----------
 tester/__init__.py             |  7 ++++---
 tester/paddle_device_vs_gpu.py | 27 ++++++++++-----------------
 3 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/engineV2.py b/engineV2.py
index d9ac1209..bb7b8a74 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -30,10 +30,10 @@
         APITestAccuracyStable,
         APITestCINNVSDygraph,
         APITestCustomDeviceVSCPU,
+        APITestPaddleDeviceVSGPU,
         APITestPaddleGPUPerformance,
         APITestPaddleOnly,
         APITestPaddleTorchGPUPerformance,
-        APITestPaddleDeviceVSGPU,
         APITestTorchGPUPerformance,
     )
 
@@ -480,7 +480,7 @@ def run_test_case(api_config_str, options):
         "accuracy_stable": APITestAccuracyStable,
         "paddle_custom_device": APITestCustomDeviceVSCPU,
     }
-    
+
     # 处理 custom_device_vs_gpu 模式
     if options.custom_device_vs_gpu:
         test_class = APITestPaddleDeviceVSGPU
@@ -709,7 +709,7 @@ def main():
             flush=True,
         )
         return
-    
+
     # 处理 custom_device_vs_gpu 模式的配置
     bos_config_data = None
     if options.custom_device_vs_gpu:
@@ -718,32 +718,32 @@ def main():
         if not bos_config_path.exists():
             print(f"BOS config file not found: {bos_config_path}", flush=True)
             return
-        
+
         try:
-            with open(bos_config_path, "r", encoding="utf-8") as f:
+            with open(bos_config_path, encoding="utf-8") as f:
                 bos_config_data = yaml.safe_load(f)
-            
+
             if not bos_config_data:
                 print(f"BOS config file is empty: {bos_config_path}", flush=True)
                 return
-            
+
             # 验证必需的配置项
             required_keys = ["bos_path", "bos_conf_path", "bcecmd_path"]
             missing_keys = [key for key in required_keys if key not in bos_config_data]
             if missing_keys:
                 print(f"Missing required keys in BOS config: {missing_keys}", flush=True)
                 return
-            
+
             # 将配置添加到 options 中，以便传递给测试类
             options.operation_mode = options.custom_device_vs_gpu
             options.bos_path = bos_config_data["bos_path"]
             options.bos_conf_path = bos_config_data["bos_conf_path"]
             options.bcecmd_path = bos_config_data["bcecmd_path"]
-            
+
         except Exception as e:
             print(f"Failed to load BOS config file {bos_config_path}: {e}", flush=True)
             return
-        
+
     if options.test_tol and not options.accuracy:
         print("--test_tol takes effect when --accuracy is True.", flush=True)
     if options.test_backward and not options.paddle_cinn:
@@ -789,10 +789,11 @@ def main():
             "accuracy_stable": APITestAccuracyStable,
             "paddle_custom_device": APITestCustomDeviceVSCPU,
         }
-        
+
         # 处理 custom_device_vs_gpu 模式
         if options.custom_device_vs_gpu:
             from tester import APITestPaddleDeviceVSGPU
+
             test_class = APITestPaddleDeviceVSGPU
         else:
             test_class = next(
diff --git a/tester/__init__.py b/tester/__init__.py
index e119b4f6..e721c402 100644
--- a/tester/__init__.py
+++ b/tester/__init__.py
@@ -27,7 +27,6 @@
     from . import paddle_to_torch
     from .accuracy import APITestAccuracy
     from .accuracy_stable import APITestAccuracyStable
-    from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
     from .api_config import (
         USE_CACHED_NUMPY,
         APIConfig,
@@ -40,6 +39,7 @@
     from .base import APITestBase
     from .paddle_cinn_vs_dygraph import APITestCINNVSDygraph
     from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
+    from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
     from .paddle_gpu_performance import APITestPaddleGPUPerformance
     from .paddle_only import APITestPaddleOnly
     from .paddle_torch_gpu_performance import APITestPaddleTorchGPUPerformance
@@ -86,10 +86,11 @@ def __getattr__(name: str) -> Any:
         from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
 
         return APITestCustomDeviceVSCPU
-    elif name == 'APITestPaddleDeviceVSGPU':
+    elif name == "APITestPaddleDeviceVSGPU":
         from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
+
         return APITestPaddleDeviceVSGPU
-    elif name == 'paddle_to_torch':
+    elif name == "paddle_to_torch":
         from . import paddle_to_torch
 
         return paddle_to_torch
diff --git a/tester/paddle_device_vs_gpu.py b/tester/paddle_device_vs_gpu.py
index de20e451..3694e163 100644
--- a/tester/paddle_device_vs_gpu.py
+++ b/tester/paddle_device_vs_gpu.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import hashlib
 import json
 import subprocess
@@ -45,6 +47,7 @@ def _get_config_hash(self):
     def _get_local_device_type(self):
         """获取当前设备的类型，优先复用 engineV2 的检测逻辑。"""
         from engineV2 import detect_device_type
+
         return detect_device_type()
 
     def _get_filename(self):
@@ -163,15 +166,13 @@ def _run_paddle(self, device_type: str):
                 print("gen_paddle_input failed", flush=True)
                 return None, None
 
-            paddle_output = self.paddle_api(
-                *tuple(self.paddle_args), **self.paddle_kwargs
-            )
+            paddle_output = self.paddle_api(*tuple(self.paddle_args), **self.paddle_kwargs)
 
             paddle_grads = None
             if self.need_check_grad():
                 inputs_list = self.get_paddle_input_list()
-                result_outputs, result_outputs_grads = (
-                    self.gen_paddle_output_and_output_grad(paddle_output)
+                result_outputs, result_outputs_grads = self.gen_paddle_output_and_output_grad(
+                    paddle_output
                 )
                 if inputs_list and result_outputs and result_outputs_grads:
                     paddle_grads = paddle.grad(
@@ -194,9 +195,7 @@ def _run_paddle(self, device_type: str):
     def _compare_with_downloaded(self, local_output, local_grads, downloaded_tensor):
         """与下载的结果进行对比"""
         try:
-            print(
-                f"[compare] Comparing results for {self.api_config.config}", flush=True
-            )
+            print(f"[compare] Comparing results for {self.api_config.config}", flush=True)
 
             # 加载下载的数据
             remote_data = paddle.load(str(downloaded_tensor))
@@ -219,9 +218,7 @@ def _compare_with_downloaded(self, local_output, local_grads, downloaded_tensor)
                     remote_output, (list, tuple)
                 ):
                     # 列表或元组对比
-                    for i, (local_item, remote_item) in enumerate(
-                        zip(local_output, remote_output)
-                    ):
+                    for i, (local_item, remote_item) in enumerate(zip(local_output, remote_output)):
                         if isinstance(local_item, paddle.Tensor) and isinstance(
                             remote_item, paddle.Tensor
                         ):
@@ -360,9 +357,7 @@ def _test_upload_mode(self):
         # 异步上传到BOS
         self._upload_to_bos(local_path)
 
-        print(
-            f"[upload] Upload mode completed for {self.api_config.config}", flush=True
-        )
+        print(f"[upload] Upload mode completed for {self.api_config.config}", flush=True)
 
     def _test_download_mode(self):
         """Download模式：下载对比数据并验证"""
@@ -395,9 +390,7 @@ def _test_download_mode(self):
             return
 
         # 与下载的结果进行对比
-        success = self._compare_with_downloaded(
-            local_output, local_grads, downloaded_file
-        )
+        success = self._compare_with_downloaded(local_output, local_grads, downloaded_file)
 
         # 清理下载的文件
         downloaded_file.unlink(missing_ok=True)

From 9f50c16fe45c40a6a76f861e4a6f154dd46343f5 Mon Sep 17 00:00:00 2001
From: ljz <470699397@qq.com>
Date: Tue, 16 Dec 2025 22:01:04 +0800
Subject: [PATCH 13/13] Restore bool --custom_device_vs_gpu flag and add --custom_device_vs_gpu_mode

---
 engineV2-README.md |  9 ++++---
 engineV2.py        | 66 +++++++++++++++++++++++-----------------------
 2 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/engineV2-README.md b/engineV2-README.md
index 5cd8e8ab..df2ab557 100644
--- a/engineV2-README.md
+++ b/engineV2-README.md
@@ -86,7 +86,8 @@
 | `--timeout`                      | int   | 单个测试用例执行超时秒数（默认 1800）                                                  |
 | `--show_runtime_status`          | bool  | 是否实时显示当前的测试进度（默认 True）                                               |
 | `--random_seed`                  | int   | numpy random的随机种子(默认为0，此时不会显式设置numpy random的seed)                   |
-| `--custom_device_vs_gpu`        | str   | 运行自定义设备与GPU的精度对比测试：`upload` 或 `download`（默认 None）                  |
+| `--custom_device_vs_gpu`        | bool  | 启用自定义设备与GPU的精度对比测试模式（默认 False）                                   |
+| `--custom_device_vs_gpu_mode`   | str   | 自定义设备与GPU对比的模式：`upload` 或 `download`（默认 `upload`）                    |
 | `--bitwise_alignment`            | bool  | 是否进行诸位对齐对比，开启后所有的api的精度对比都按照atol=0.0,rtol = 0.0的精度对比结果|
 
 
@@ -169,7 +170,8 @@ bcecmd_path: "./bcecmd"
 **在 GPU 上执行测试并上传结果**
 ```bash
 # 在 GPU 设备上执行，生成1210-xxx.pdtensor 文件并上传到 BOS
-python engineV2.py --custom_device_vs_gpu=upload \
+python engineV2.py --custom_device_vs_gpu=True \
+  --custom_device_vs_gpu_mode=upload \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
   --gpu_ids=7
@@ -177,7 +179,8 @@ python engineV2.py --custom_device_vs_gpu=upload \
 
 **在 XPU 上下载 GPU 的参考数据并进行精度对比**
 ```bash
-python engineV2.py --custom_device_vs_gpu=download \
+python engineV2.py --custom_device_vs_gpu=True \
+  --custom_device_vs_gpu_mode=download \
   --random_seed=1210 \
   --api_config_file="./test1.txt" \
   --gpu_ids=7
diff --git a/engineV2.py b/engineV2.py
index bb7b8a74..8e901239 100644
--- a/engineV2.py
+++ b/engineV2.py
@@ -479,16 +479,13 @@ def run_test_case(api_config_str, options):
         "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
         "accuracy_stable": APITestAccuracyStable,
         "paddle_custom_device": APITestCustomDeviceVSCPU,
+        "custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
     }
 
-    # 处理 custom_device_vs_gpu 模式
-    if options.custom_device_vs_gpu:
-        test_class = APITestPaddleDeviceVSGPU
-    else:
-        test_class = next(
-            (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
-            APITestAccuracy,  # default fallback
-        )
+    test_class = next(
+        (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
+        APITestAccuracy,  # default fallback
+    )
     kwargs = {k: v for k, v in vars(options).items() if k in VALID_TEST_ARGS}
     case = test_class(api_config, **kwargs)
     try:
@@ -666,10 +663,16 @@ def main():
     )
     parser.add_argument(
         "--custom_device_vs_gpu",
+        type=parse_bool,
+        default=False,
+        help="test paddle api on custom device vs GPU",
+    )
+    parser.add_argument(
+        "--custom_device_vs_gpu_mode",
         type=str,
         choices=["upload", "download"],
-        default=None,
-        help="test paddle api on custom device vs GPU: 'upload' or 'download'",
+        default="upload",
+        help="operation mode for custom_device_vs_gpu: 'upload' or 'download'",
     )
     parser.add_argument(
         "--bitwise_alignment",
@@ -692,7 +695,7 @@ def main():
         options.paddle_torch_gpu_performance,
         options.accuracy_stable,
         options.paddle_custom_device,
-        options.custom_device_vs_gpu is not None,
+        options.custom_device_vs_gpu,
     ]
     if len([m for m in mode if m is True]) != 1:
         print(
@@ -705,7 +708,7 @@ def main():
             "--paddle_torch_gpu_performance"
             "--accuracy_stable"
             "--paddle_custom_device"
-            "--custom_device_vs_gpu=upload or --custom_device_vs_gpu=download",
+            "--custom_device_vs_gpu",
             flush=True,
         )
         return
@@ -735,7 +738,7 @@ def main():
                 return
 
             # 将配置添加到 options 中，以便传递给测试类
-            options.operation_mode = options.custom_device_vs_gpu
+            options.operation_mode = options.custom_device_vs_gpu_mode
             options.bos_path = bos_config_data["bos_path"]
             options.bos_conf_path = bos_config_data["bos_conf_path"]
             options.bcecmd_path = bos_config_data["bcecmd_path"]
@@ -762,6 +765,8 @@ def main():
             APITestAccuracy,
             APITestAccuracyStable,
             APITestCINNVSDygraph,
+            APITestCustomDeviceVSCPU,
+            APITestPaddleDeviceVSGPU,
             APITestPaddleGPUPerformance,
             APITestPaddleOnly,
             APITestPaddleTorchGPUPerformance,
@@ -788,31 +793,26 @@ def main():
             "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
             "accuracy_stable": APITestAccuracyStable,
             "paddle_custom_device": APITestCustomDeviceVSCPU,
+            "custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
         }
 
-        # 处理 custom_device_vs_gpu 模式
-        if options.custom_device_vs_gpu:
-            from tester import APITestPaddleDeviceVSGPU
-
-            test_class = APITestPaddleDeviceVSGPU
-        else:
-            test_class = next(
-                (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
-                APITestAccuracy,  # default fallback
-            )
+        test_class = next(
+            (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
+            APITestAccuracy,  # default fallback
+        )
 
         if options.custom_device_vs_gpu:
             # custom_device_vs_gpu 模式需要传递额外参数
-            kwargs = {
-                "operation_mode": options.operation_mode,
-                "bos_path": options.bos_path,
-                "bos_conf_path": options.bos_conf_path,
-                "bcecmd_path": options.bcecmd_path,
-                "random_seed": options.random_seed,
-                "atol": options.atol,
-                "rtol": options.rtol,
-            }
-            case = test_class(api_config, **kwargs)
+            case = test_class(
+                api_config,
+                operation_mode=options.operation_mode,
+                bos_path=options.bos_path,
+                bos_conf_path=options.bos_conf_path,
+                bcecmd_path=options.bcecmd_path,
+                random_seed=options.random_seed,
+                atol=options.atol,
+                rtol=options.rtol,
+            )
         elif options.accuracy:
             case = test_class(
                 api_config,