PFCCLab · DanielSun11 · Dec 17, 2025 · Dec 3, 2025 · Dec 3, 2025 · Dec 5, 2025
diff --git a/engineV2-README.md b/engineV2-README.md
@@ -86,6 +86,8 @@
 | `--timeout`                      | int   | 单个测试用例执行超时秒数（默认 1800）                                                  |
 | `--show_runtime_status`          | bool  | 是否实时显示当前的测试进度（默认 True）                                               |
 | `--random_seed`                  | int   | numpy random的随机种子(默认为0，此时不会显式设置numpy random的seed)                   |
+| `--custom_device_vs_gpu`         | bool  | 启用自定义设备与 GPU 的精度对比测试模式（默认 False）                                 |
+| `--custom_device_vs_gpu_mode`    | str   | 自定义设备与 GPU 对比的模式：`upload` 或 `download`（默认 `upload`），仅在 `--custom_device_vs_gpu=True` 时生效 |
 | `--bitwise_alignment`            | bool  | 是否进行逐位对齐对比，开启后所有 API 的精度对比都按照 atol=0.0、rtol=0.0 的精度对比结果 |
 
 
@@ -123,6 +125,67 @@ python engineV2.py --accuracy=True --api_config_file="tester/api_config/api_conf
 ```
 该脚本使用参数：`NUM_GPUS=-1, NUM_WORKERS_PER_GPU=-1, GPU_IDS="4,5,6,7"`，在后台运行程序，可在修改 `run.sh` 参数后使用
 
+### 自定义设备与 GPU 精度对比测试
+
+#### 功能说明
+
+`APITestPaddleDeviceVSGPU` 类支持跨设备的精度对比测试，目前主要面向 **GPU 上传 + XPU（或其他设备）下载对比** 这一典型场景。该功能分为两个模式：
+
+- **Upload 模式（GPU 侧）**：在 GPU 上执行测试，保存结果到本地，然后上传到 BOS 云存储
+- **Download 模式（XPU/其他设备侧）**：在 XPU 或其他设备上执行测试，从 BOS 下载 GPU 侧的参考数据进行精度对比
+
+#### 工作流程
+
+1. **Upload 模式工作流（GPU 侧）**：
+   - 在 GPU 设备上执行 Paddle API 测试
+   - 保存 Forward 输出和 Backward 梯度到本地 PDTensor 文件
+   - 文件名由随机种子与配置哈希共同决定（如 `--random_seed=1210` 时生成 `1210-xxx.pdtensor`）
+   - 使用 bcecmd 工具将文件上传到 BOS 云存储
+
+2. **Download 模式工作流（XPU/其他设备侧）**：
+   - 在 XPU 或其他设备上执行相同的 Paddle API 测试
+   - 使用与 GPU 侧上传时一致的随机种子和配置，构造同名 PDTensor 文件名
+   - 从 BOS 云存储下载对应的 GPU 参考数据
+   - 对比 Forward 输出和 Backward 梯度，验证与 GPU 的精度一致性
+
+#### 配置文件设置
+
+首先，编辑 `tester/bos_config.yaml` 配置文件（程序固定从该相对路径读取；`bos_path`、`bos_conf_path`、`bcecmd_path` 三项均为必填，文件不存在、为空或缺少任一项时程序会打印错误信息并直接退出）：
+
+```yaml
+# BOS 配置文件
+# 用于自定义设备与 GPU 精度对比测试的云存储配置
+
+# BOS 存储路径（如：xly-devops/liujingzong/）
+bos_path: "xly-devops/liujingzong/"
+
+# BOS 配置文件路径（bcecmd 使用的配置文件路径）
+bos_conf_path: "./conf"
+
+# bcecmd 命令行工具路径
+bcecmd_path: "./bcecmd"
+```
+
+#### 命令示例
+**在 GPU 上执行测试并上传结果**
+```bash
+# 在 GPU 设备上执行，生成 1210-xxx.pdtensor 文件并上传到 BOS
+python engineV2.py --custom_device_vs_gpu=True \
+  --custom_device_vs_gpu_mode=upload \
+  --random_seed=1210 \
+  --api_config_file="./test1.txt" \
+  --gpu_ids=7
+```
+
+**在 XPU 上下载 GPU 的参考数据并进行精度对比**
+```bash
+python engineV2.py --custom_device_vs_gpu=True \
+  --custom_device_vs_gpu_mode=download \
+  --random_seed=1210 \
+  --api_config_file="./test1.txt" \
+  --gpu_ids=7
+```
+
 ## 监控方法
 
 执行 `run.sh` 后可通过以下方式监控：

diff --git a/engineV2.py b/engineV2.py
@@ -13,10 +13,12 @@
 from concurrent.futures import TimeoutError, as_completed
 from datetime import datetime
 from multiprocessing import Lock, Manager, cpu_count, set_start_method
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 import numpy as np
 import pynvml
+import yaml
 from pebble import ProcessExpired, ProcessPool
 
 if TYPE_CHECKING:
@@ -28,6 +30,7 @@
         APITestAccuracyStable,
         APITestCINNVSDygraph,
         APITestCustomDeviceVSCPU,
+        APITestPaddleDeviceVSGPU,
         APITestPaddleGPUPerformance,
         APITestPaddleOnly,
         APITestPaddleTorchGPUPerformance,
@@ -39,7 +42,18 @@
 os.environ["FLAGS_USE_SYSTEM_ALLOCATOR"] = "1"
 os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
 
-VALID_TEST_ARGS = {"test_amp", "test_backward", "atol", "rtol", "test_tol"}
+VALID_TEST_ARGS = {
+    "test_amp",
+    "test_backward",
+    "atol",
+    "rtol",
+    "test_tol",
+    "operation_mode",
+    "bos_path",
+    "random_seed",
+    "bos_conf_path",
+    "bcecmd_path",
+}
 
 DEVICE_TYPE = None
 DEVICE_TYPE_DETECTED = False
@@ -123,7 +137,7 @@ def detect_device_type() -> str:
         try:
             out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
             if any(re.match(r"^\|\s*\d+\s+Iluvatar", line) for line in out.splitlines()):
-                DEVICE_TYPE = "iluvatar"
+                DEVICE_TYPE = "iluvatar_gpu"
                 DEVICE_TYPE_DETECTED = True
                 return DEVICE_TYPE
         except Exception:
@@ -164,7 +178,7 @@ def get_device_count() -> int:
         DEVICE_COUNT = len(ids)
         return DEVICE_COUNT
 
-    if device_type == "iluvatar":
+    if device_type == "iluvatar_gpu":
         out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
         ids = set()
         for line in out.splitlines():
@@ -202,7 +216,7 @@ def _refresh_snapshot(device_type):
                         snapshot[dev_id] = (total_mib / 1024.0, used_mib / 1024.0)
                         break
 
-    elif device_type == "iluvatar":
+    elif device_type == "iluvatar_gpu":
         out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
         lines = out.splitlines()
         for i, line in enumerate(lines):
@@ -240,7 +254,7 @@ def get_memory_info(gpu_id):
         finally:
             pynvml.nvmlShutdown()
 
-    if device_type in ("xpu", "iluvatar"):
+    if device_type in ("xpu", "iluvatar_gpu"):
         _refresh_snapshot(device_type)
         if _MEM_SNAPSHOT is None or gpu_id not in _MEM_SNAPSHOT:
             raise RuntimeError(f"Failed to get memory info for {device_type} device {gpu_id}")
@@ -379,6 +393,7 @@ def pid_exists(pid):
             APITestAccuracyStable,
             APITestCINNVSDygraph,
             APITestCustomDeviceVSCPU,
+            APITestPaddleDeviceVSGPU,
             APITestPaddleGPUPerformance,
             APITestPaddleOnly,
             APITestPaddleTorchGPUPerformance,
@@ -395,6 +410,7 @@ def pid_exists(pid):
             "APITestPaddleTorchGPUPerformance": APITestPaddleTorchGPUPerformance,
             "APITestAccuracyStable": APITestAccuracyStable,
             "APITestCustomDeviceVSCPU": APITestCustomDeviceVSCPU,
+            "APITestPaddleDeviceVSGPU": APITestPaddleDeviceVSGPU,
         }
         globals().update(test_classes)
 
@@ -463,7 +479,9 @@ def run_test_case(api_config_str, options):
         "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
         "accuracy_stable": APITestAccuracyStable,
         "paddle_custom_device": APITestCustomDeviceVSCPU,
+        "custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
     }
+
     test_class = next(
         (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
         APITestAccuracy,  # default fallback
@@ -643,6 +661,19 @@ def main():
         default=0,
         help="The numpy random seed ",
     )
+    parser.add_argument(
+        "--custom_device_vs_gpu",
+        type=parse_bool,
+        default=False,
+        help="test paddle api on custom device vs GPU",
+    )
+    parser.add_argument(
+        "--custom_device_vs_gpu_mode",
+        type=str,
+        choices=["upload", "download"],
+        default="upload",
+        help="operation mode for custom_device_vs_gpu: 'upload' or 'download'",
+    )
     parser.add_argument(
         "--bitwise_alignment",
         type=bool,
@@ -664,6 +695,7 @@ def main():
         options.paddle_torch_gpu_performance,
         options.accuracy_stable,
         options.paddle_custom_device,
+        options.custom_device_vs_gpu,
     ]
     if len([m for m in mode if m is True]) != 1:
         print(
@@ -676,10 +708,45 @@ def main():
             "--paddle_torch_gpu_performance"
             "--accuracy_stable"
             "--paddle_custom_device"
-            " to True.",
+            "--custom_device_vs_gpu",
             flush=True,
         )
         return
+
+    # 处理 custom_device_vs_gpu 模式的配置
+    bos_config_data = None
+    if options.custom_device_vs_gpu:
+        # 读取 BOS 配置文件（固定路径：tester/bos_config.yaml）
+        bos_config_path = Path("tester/bos_config.yaml")
+        if not bos_config_path.exists():
+            print(f"BOS config file not found: {bos_config_path}", flush=True)
+            return
+
+        try:
+            with open(bos_config_path, encoding="utf-8") as f:
+                bos_config_data = yaml.safe_load(f)
+
+            if not bos_config_data:
+                print(f"BOS config file is empty: {bos_config_path}", flush=True)
+                return
+
+            # 验证必需的配置项
+            required_keys = ["bos_path", "bos_conf_path", "bcecmd_path"]
+            missing_keys = [key for key in required_keys if key not in bos_config_data]
+            if missing_keys:
+                print(f"Missing required keys in BOS config: {missing_keys}", flush=True)
+                return
+
+            # 将配置添加到 options 中，以便传递给测试类
+            options.operation_mode = options.custom_device_vs_gpu_mode
+            options.bos_path = bos_config_data["bos_path"]
+            options.bos_conf_path = bos_config_data["bos_conf_path"]
+            options.bcecmd_path = bos_config_data["bcecmd_path"]
+
+        except Exception as e:
+            print(f"Failed to load BOS config file {bos_config_path}: {e}", flush=True)
+            return
+
     if options.test_tol and not options.accuracy:
         print("--test_tol takes effect when --accuracy is True.", flush=True)
     if options.test_backward and not options.paddle_cinn:
@@ -698,6 +765,8 @@ def main():
             APITestAccuracy,
             APITestAccuracyStable,
             APITestCINNVSDygraph,
+            APITestCustomDeviceVSCPU,
+            APITestPaddleDeviceVSGPU,
             APITestPaddleGPUPerformance,
             APITestPaddleOnly,
             APITestPaddleTorchGPUPerformance,
@@ -724,13 +793,27 @@ def main():
             "paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
             "accuracy_stable": APITestAccuracyStable,
             "paddle_custom_device": APITestCustomDeviceVSCPU,
+            "custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
         }
+
         test_class = next(
             (cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
             APITestAccuracy,  # default fallback
         )
 
-        if options.accuracy:
+        if options.custom_device_vs_gpu:
+            # custom_device_vs_gpu 模式需要传递额外参数
+            case = test_class(
+                api_config,
+                operation_mode=options.operation_mode,
+                bos_path=options.bos_path,
+                bos_conf_path=options.bos_conf_path,
+                bcecmd_path=options.bcecmd_path,
+                random_seed=options.random_seed,
+                atol=options.atol,
+                rtol=options.rtol,
+            )
+        elif options.accuracy:
             case = test_class(
                 api_config,
                 test_amp=options.test_amp,

diff --git a/tester/__init__.py b/tester/__init__.py
@@ -10,6 +10,7 @@
     "APITestBase",
     "APITestCINNVSDygraph",
     "APITestCustomDeviceVSCPU",
+    "APITestPaddleDeviceVSGPU",
     "APITestPaddleGPUPerformance",
     "APITestPaddleOnly",
     "APITestPaddleTorchGPUPerformance",
@@ -38,6 +39,7 @@
     from .base import APITestBase
     from .paddle_cinn_vs_dygraph import APITestCINNVSDygraph
     from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
+    from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
     from .paddle_gpu_performance import APITestPaddleGPUPerformance
     from .paddle_only import APITestPaddleOnly
     from .paddle_torch_gpu_performance import APITestPaddleTorchGPUPerformance
@@ -84,6 +86,10 @@ def __getattr__(name: str) -> Any:
         from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
 
         return APITestCustomDeviceVSCPU
+    elif name == "APITestPaddleDeviceVSGPU":
+        from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
+
+        return APITestPaddleDeviceVSGPU
     elif name == "paddle_to_torch":
         from . import paddle_to_torch
 

diff --git a/tester/bos_config.yaml b/tester/bos_config.yaml
@@ -0,0 +1,12 @@
+# BOS 配置文件
+# 用于自定义设备与 GPU 精度对比测试的云存储配置
+
+# BOS 存储路径（如：xly-devops/liujingzong/）
+bos_path: "xly-devops/liujingzong/"
+
+# BOS 配置文件路径（bcecmd 使用的配置文件路径）
+bos_conf_path: "./conf"
+
+# bcecmd 命令行工具路径
+bcecmd_path: "./bcecmd"
+