From 917976defe3af2f3d9b30cf6d9eab8e8fbe0c499 Mon Sep 17 00:00:00 2001 From: austin1997 <18709560+austin1997@users.noreply.github.com> Date: Sat, 18 Apr 2026 03:33:01 +0000 Subject: [PATCH] [ROCm] Remove BF16 workarounds for PaddleOCR-VL on HIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PaddleOCR-VL pipeline previously needed two ROCm-specific escape hatches: * `_keep_in_fp32_modules = ["visual", "mlp_AR"]` on PaddleOCRVLForConditionalGeneration kept the SigLIP vision tower and the multimodal projector in FP32 because BF16 layer_norm and BF16 softmax were not registered for HIP, so running the vision encoder in BF16 crashed. * Four `paddle.is_compiled_with_rocm()` blocks in `paddlex/inference/models/runners/paddle_static/runner.py` (lines 406-408, 462-464, 496-498, 505-507) called `delete_pass("conv2d_add_act_fuse_pass")` and `delete_pass("conv2d_add_fuse_pass")` because both PIR passes rewrite conv2d+add[+act] into the `fused_conv2d_add_act` op, which only has a cuDNN-backed GPUDNN kernel — kernel dispatch then failed on ROCm. These are addressed at the framework level by the upstream Paddle BF16 fix (layer_norm + softmax registration on HIP, plus gating both PIR passes on PADDLE_WITH_CUDA so they no longer run on HIP builds). With that wheel installed, both PaddleX workarounds become unnecessary: * Drop `_keep_in_fp32_modules` so the vision encoder + multimodal projector run natively in BF16 on ROCm. End-to-end output matches the FP32-fallback path on PaddleOCR-VL-1.5 (validated on MI300X / gfx942 / ROCm 7.2). This overlaps with #5077; if #5077 lands first, the conflict is trivial. * Drop all four `delete_pass` blocks under `paddle.is_compiled_with_rocm()`. Once the framework PR lands, the two passes are no longer registered on HIP wheels, so `delete_pass` becomes a no-op there. This change requires the framework BF16 PR to be merged and released; with older Paddle wheels the BF16 visual path will still crash on ROCm. 
CUDA behavior is unchanged — both passes remain registered under PADDLE_WITH_CUDA, and the vision encoder simply uses whatever dtype the model is loaded with. --- .../doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py | 3 --- .../inference/models/runners/paddle_static/runner.py | 12 ------------ 2 files changed, 15 deletions(-) diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py index ab5a9cd87e..f7f4cac5c0 100644 --- a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py +++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py @@ -65,9 +65,6 @@ class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel): _tied_weights_keys = ["lm_head.weight"] config_class = PaddleOCRVLConfig _no_split_modules = ["Ernie4_5DecoderLayer", "SiglipEncoderLayer"] - # Keep visual encoder in fp32 for ROCm stability (MIOpen bf16 conv has bugs) - # This also improves precision for vision processing - _keep_in_fp32_modules = ["visual", "mlp_AR"] base_model_prefix = "" def __init__(self, config): diff --git a/paddlex/inference/models/runners/paddle_static/runner.py b/paddlex/inference/models/runners/paddle_static/runner.py index 894feb3b12..9f84c4aac9 100644 --- a/paddlex/inference/models/runners/paddle_static/runner.py +++ b/paddlex/inference/models/runners/paddle_static/runner.py @@ -403,9 +403,6 @@ def _create(self): config.set_optimization_level(3) if self._model_name == "PP-DocLayoutV3": config.delete_pass("matmul_add_act_fuse_pass") - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") elif self._config["device_type"] == "npu": config.enable_custom_device("npu", self._config.get("device_id", 0)) if hasattr(config, "enable_new_ir"): @@ -459,9 +456,6 @@ def _create(self): config.disable_mkldnn() if hasattr(config, "enable_new_executor"): 
config.enable_new_executor() - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") elif self._config["device_type"] == "iluvatar_gpu": config.enable_custom_device( "iluvatar_gpu", int(self._config.get("device_id", 0)) @@ -493,18 +487,12 @@ def _create(self): if hasattr(config, "enable_new_executor"): config.enable_new_executor() config.set_optimization_level(3) - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") config.enable_memory_optim() for del_p in self._config.get("delete_pass", []): config.delete_pass(del_p) if not DEBUG: config.disable_glog_info() - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") predictor = paddle_inference.create_predictor(config)