Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions engineV2-README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@
| `--timeout` | int | 单个测试用例执行超时秒数(默认 1800) |
| `--show_runtime_status` | bool | 是否实时显示当前的测试进度(默认 True) |
| `--random_seed` | int | numpy random的随机种子(默认为0,此时不会显式设置numpy random的seed) |
| `--custom_device_vs_gpu` | bool | 启用自定义设备与GPU的精度对比测试模式(默认 False) |
| `--custom_device_vs_gpu_mode` | str | 自定义设备与GPU对比的模式:`upload` 或 `download`(默认 `upload`) |
| `--bitwise_alignment` | bool | 是否进行逐位对齐对比,开启后所有 API 的精度对比都按照 atol=0.0、rtol=0.0 进行 |


Expand Down Expand Up @@ -123,6 +125,67 @@ python engineV2.py --accuracy=True --api_config_file="tester/api_config/api_conf
```
该脚本使用参数:`NUM_GPUS=-1, NUM_WORKERS_PER_GPU=-1, GPU_IDS="4,5,6,7"`,在后台运行程序,可在修改 `run.sh` 参数后使用

### 自定义设备与 GPU 精度对比测试

#### 功能说明

`APITestPaddleDeviceVSGPU` 类支持跨设备的精度对比测试,目前主要面向 **GPU 上传 + XPU(或其他设备)下载对比** 这一典型场景。该功能分为两个模式:

- **Upload 模式(GPU 侧)**:在 GPU 上执行测试,保存结果到本地,然后上传到 BOS 云存储
- **Download 模式(XPU/其他设备侧)**:在 XPU 或其他设备上执行测试,从 BOS 下载 GPU 侧的参考数据进行精度对比

#### 工作流程

1. **Upload 模式工作流(GPU 侧)**:
- 在 GPU 设备上执行 Paddle API 测试
- 保存 Forward 输出和 Backward 梯度到本地 PDTensor 文件
- 文件名依赖随机种子与配置哈希(如 `1210-xxx.pdtensor`)
- 使用 bcecmd 工具将文件上传到 BOS 云存储

2. **Download 模式工作流(XPU/其他设备侧)**:
- 在 XPU 或其他设备上执行相同的 Paddle API 测试
- 使用与 GPU 侧上传时一致的随机种子和配置,构造出与 GPU 侧相同的 PDTensor 文件名
- 从 BOS 云存储下载对应的 GPU 参考数据
- 对比 Forward 输出和 Backward 梯度,验证与 GPU 的精度一致性

#### 配置文件设置

首先,编辑 `tester/bos_config.yaml` 配置文件:

```yaml
# BOS 配置文件
# 用于自定义设备与 GPU 精度对比测试的云存储配置

# BOS 存储路径(如:xly-devops/liujingzong/)
bos_path: "xly-devops/liujingzong/"

# BOS 配置文件路径(bcecmd 使用的配置文件路径)
bos_conf_path: "./conf"

# bcecmd 命令行工具路径
bcecmd_path: "./bcecmd"
```

#### 命令示例

**在 GPU 上执行测试并上传结果**
```bash
# 在 GPU 设备上执行,生成 1210-xxx.pdtensor 文件并上传到 BOS
python engineV2.py --custom_device_vs_gpu=True \
--custom_device_vs_gpu_mode=upload \
--random_seed=1210 \
--api_config_file="./test1.txt" \
--gpu_ids=7
```

**在 XPU 上下载 GPU 的参考数据并进行精度对比**
```bash
python engineV2.py --custom_device_vs_gpu=True \
--custom_device_vs_gpu_mode=download \
--random_seed=1210 \
--api_config_file="./test1.txt" \
--gpu_ids=7
```

## 监控方法

执行 `run.sh` 后可通过以下方式监控:
Expand Down
97 changes: 90 additions & 7 deletions engineV2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
from concurrent.futures import TimeoutError, as_completed
from datetime import datetime
from multiprocessing import Lock, Manager, cpu_count, set_start_method
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import pynvml
import yaml
from pebble import ProcessExpired, ProcessPool

if TYPE_CHECKING:
Expand All @@ -28,6 +30,7 @@
APITestAccuracyStable,
APITestCINNVSDygraph,
APITestCustomDeviceVSCPU,
APITestPaddleDeviceVSGPU,
APITestPaddleGPUPerformance,
APITestPaddleOnly,
APITestPaddleTorchGPUPerformance,
Expand All @@ -39,7 +42,18 @@
os.environ["FLAGS_USE_SYSTEM_ALLOCATOR"] = "1"
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"

VALID_TEST_ARGS = {"test_amp", "test_backward", "atol", "rtol", "test_tol"}
VALID_TEST_ARGS = {
"test_amp",
"test_backward",
"atol",
"rtol",
"test_tol",
"operation_mode",
"bos_path",
"random_seed",
"bos_conf_path",
"bcecmd_path",
}

DEVICE_TYPE = None
DEVICE_TYPE_DETECTED = False
Expand Down Expand Up @@ -123,7 +137,7 @@ def detect_device_type() -> str:
try:
out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
if any(re.match(r"^\|\s*\d+\s+Iluvatar", line) for line in out.splitlines()):
DEVICE_TYPE = "iluvatar"
DEVICE_TYPE = "iluvatar_gpu"
DEVICE_TYPE_DETECTED = True
return DEVICE_TYPE
except Exception:
Expand Down Expand Up @@ -164,7 +178,7 @@ def get_device_count() -> int:
DEVICE_COUNT = len(ids)
return DEVICE_COUNT

if device_type == "iluvatar":
if device_type == "iluvatar_gpu":
out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
ids = set()
for line in out.splitlines():
Expand Down Expand Up @@ -202,7 +216,7 @@ def _refresh_snapshot(device_type):
snapshot[dev_id] = (total_mib / 1024.0, used_mib / 1024.0)
break

elif device_type == "iluvatar":
elif device_type == "iluvatar_gpu":
out = subprocess.check_output(["ixsmi"], text=True, stderr=subprocess.STDOUT)
lines = out.splitlines()
for i, line in enumerate(lines):
Expand Down Expand Up @@ -240,7 +254,7 @@ def get_memory_info(gpu_id):
finally:
pynvml.nvmlShutdown()

if device_type in ("xpu", "iluvatar"):
if device_type in ("xpu", "iluvatar_gpu"):
_refresh_snapshot(device_type)
if _MEM_SNAPSHOT is None or gpu_id not in _MEM_SNAPSHOT:
raise RuntimeError(f"Failed to get memory info for {device_type} device {gpu_id}")
Expand Down Expand Up @@ -379,6 +393,7 @@ def pid_exists(pid):
APITestAccuracyStable,
APITestCINNVSDygraph,
APITestCustomDeviceVSCPU,
APITestPaddleDeviceVSGPU,
APITestPaddleGPUPerformance,
APITestPaddleOnly,
APITestPaddleTorchGPUPerformance,
Expand All @@ -395,6 +410,7 @@ def pid_exists(pid):
"APITestPaddleTorchGPUPerformance": APITestPaddleTorchGPUPerformance,
"APITestAccuracyStable": APITestAccuracyStable,
"APITestCustomDeviceVSCPU": APITestCustomDeviceVSCPU,
"APITestPaddleDeviceVSGPU": APITestPaddleDeviceVSGPU,
}
globals().update(test_classes)

Expand Down Expand Up @@ -463,7 +479,9 @@ def run_test_case(api_config_str, options):
"paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
"accuracy_stable": APITestAccuracyStable,
"paddle_custom_device": APITestCustomDeviceVSCPU,
"custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
}

test_class = next(
(cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
APITestAccuracy, # default fallback
Expand Down Expand Up @@ -643,6 +661,19 @@ def main():
default=0,
help="The numpy random seed ",
)
parser.add_argument(
"--custom_device_vs_gpu",
type=parse_bool,
default=False,
help="test paddle api on custom device vs GPU",
)
parser.add_argument(
"--custom_device_vs_gpu_mode",
type=str,
choices=["upload", "download"],
default="upload",
help="operation mode for custom_device_vs_gpu: 'upload' or 'download'",
)
parser.add_argument(
"--bitwise_alignment",
type=bool,
Expand All @@ -664,6 +695,7 @@ def main():
options.paddle_torch_gpu_performance,
options.accuracy_stable,
options.paddle_custom_device,
options.custom_device_vs_gpu,
]
if len([m for m in mode if m is True]) != 1:
print(
Expand All @@ -676,10 +708,45 @@ def main():
"--paddle_torch_gpu_performance"
"--accuracy_stable"
"--paddle_custom_device"
" to True.",
"--custom_device_vs_gpu",
flush=True,
)
return

# 处理 custom_device_vs_gpu 模式的配置
bos_config_data = None
if options.custom_device_vs_gpu:
# 读取 BOS 配置文件(固定路径:tester/bos_config.yaml)
bos_config_path = Path("tester/bos_config.yaml")
if not bos_config_path.exists():
print(f"BOS config file not found: {bos_config_path}", flush=True)
return

try:
with open(bos_config_path, encoding="utf-8") as f:
bos_config_data = yaml.safe_load(f)

if not bos_config_data:
print(f"BOS config file is empty: {bos_config_path}", flush=True)
return

# 验证必需的配置项
required_keys = ["bos_path", "bos_conf_path", "bcecmd_path"]
missing_keys = [key for key in required_keys if key not in bos_config_data]
if missing_keys:
print(f"Missing required keys in BOS config: {missing_keys}", flush=True)
return

# 将配置添加到 options 中,以便传递给测试类
options.operation_mode = options.custom_device_vs_gpu_mode
options.bos_path = bos_config_data["bos_path"]
options.bos_conf_path = bos_config_data["bos_conf_path"]
options.bcecmd_path = bos_config_data["bcecmd_path"]

except Exception as e:
print(f"Failed to load BOS config file {bos_config_path}: {e}", flush=True)
return

if options.test_tol and not options.accuracy:
print("--test_tol takes effect when --accuracy is True.", flush=True)
if options.test_backward and not options.paddle_cinn:
Expand All @@ -698,6 +765,8 @@ def main():
APITestAccuracy,
APITestAccuracyStable,
APITestCINNVSDygraph,
APITestCustomDeviceVSCPU,
APITestPaddleDeviceVSGPU,
APITestPaddleGPUPerformance,
APITestPaddleOnly,
APITestPaddleTorchGPUPerformance,
Expand All @@ -724,13 +793,27 @@ def main():
"paddle_torch_gpu_performance": APITestPaddleTorchGPUPerformance,
"accuracy_stable": APITestAccuracyStable,
"paddle_custom_device": APITestCustomDeviceVSCPU,
"custom_device_vs_gpu": APITestPaddleDeviceVSGPU,
}

test_class = next(
(cls for opt, cls in option_to_class.items() if getattr(options, opt, False)),
APITestAccuracy, # default fallback
)

if options.accuracy:
if options.custom_device_vs_gpu:
# custom_device_vs_gpu 模式需要传递额外参数
case = test_class(
api_config,
operation_mode=options.operation_mode,
bos_path=options.bos_path,
bos_conf_path=options.bos_conf_path,
bcecmd_path=options.bcecmd_path,
random_seed=options.random_seed,
atol=options.atol,
rtol=options.rtol,
)
elif options.accuracy:
case = test_class(
api_config,
test_amp=options.test_amp,
Expand Down
6 changes: 6 additions & 0 deletions tester/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"APITestBase",
"APITestCINNVSDygraph",
"APITestCustomDeviceVSCPU",
"APITestPaddleDeviceVSGPU",
"APITestPaddleGPUPerformance",
"APITestPaddleOnly",
"APITestPaddleTorchGPUPerformance",
Expand Down Expand Up @@ -38,6 +39,7 @@
from .base import APITestBase
from .paddle_cinn_vs_dygraph import APITestCINNVSDygraph
from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU
from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU
from .paddle_gpu_performance import APITestPaddleGPUPerformance
from .paddle_only import APITestPaddleOnly
from .paddle_torch_gpu_performance import APITestPaddleTorchGPUPerformance
Expand Down Expand Up @@ -84,6 +86,10 @@ def __getattr__(name: str) -> Any:
from .paddle_device_vs_cpu import APITestCustomDeviceVSCPU

return APITestCustomDeviceVSCPU
elif name == "APITestPaddleDeviceVSGPU":
from .paddle_device_vs_gpu import APITestPaddleDeviceVSGPU

return APITestPaddleDeviceVSGPU
elif name == "paddle_to_torch":
from . import paddle_to_torch

Expand Down
12 changes: 12 additions & 0 deletions tester/bos_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# BOS 配置文件
# 用于自定义设备与 GPU 精度对比测试的云存储配置

# BOS 存储路径(如:xly-devops/liujingzong/)
bos_path: "xly-devops/liujingzong/"

# BOS 配置文件路径(bcecmd 使用的配置文件路径)
bos_conf_path: "./conf"

# bcecmd 命令行工具路径
bcecmd_path: "./bcecmd"

Loading