From 917976defe3af2f3d9b30cf6d9eab8e8fbe0c499 Mon Sep 17 00:00:00 2001 From: austin1997 <18709560+austin1997@users.noreply.github.com> Date: Sat, 18 Apr 2026 03:33:01 +0000 Subject: [PATCH] [ROCm] Remove BF16 workarounds for PaddleOCR-VL on HIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PaddleOCR-VL pipeline previously needed two ROCm-specific escape hatches: * `_keep_in_fp32_modules = ["visual", "mlp_AR"]` on PaddleOCRVLForConditionalGeneration kept the SigLIP vision tower and the multimodal projector in FP32 because BF16 layer_norm and BF16 softmax were not registered for HIP, so running the vision encoder in BF16 crashed. * Four `paddle.is_compiled_with_rocm()` blocks in `paddlex/inference/models/runners/paddle_static/runner.py` (lines 406-408, 462-464, 496-498, 505-507) called `delete_pass("conv2d_add_act_fuse_pass")` and `delete_pass("conv2d_add_fuse_pass")` because both PIR passes rewrite conv2d+add[+act] into the `fused_conv2d_add_act` op, which only has a cuDNN-backed GPUDNN kernel — kernel dispatch then failed on ROCm. These are addressed at the framework level by the upstream Paddle BF16 fix (layer_norm + softmax registration on HIP, plus gating both PIR passes on PADDLE_WITH_CUDA so they no longer run on HIP builds). With that wheel installed, both PaddleX workarounds become unnecessary: * Drop `_keep_in_fp32_modules` so the vision encoder + multimodal projector run natively in BF16 on ROCm. End-to-end output matches the FP32-fallback path on PaddleOCR-VL-1.5 (validated on MI300X / gfx942 / ROCm 7.2). This overlaps with #5077; if #5077 lands first, the conflict is trivial. * Drop all four `delete_pass` blocks under `paddle.is_compiled_with_rocm()`. Once the framework PR lands, the two passes are no longer registered on HIP wheels, so `delete_pass` becomes a no-op there. This change requires the framework BF16 PR to be merged and released; with older Paddle wheels the BF16 visual path will still crash on ROCm. 
CUDA behavior is unchanged — both passes remain registered under PADDLE_WITH_CUDA, and the vision encoder simply uses whatever dtype the model is loaded with. --- .../doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py | 3 --- .../inference/models/runners/paddle_static/runner.py | 12 ------------ 2 files changed, 15 deletions(-) diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py index ab5a9cd87e..f7f4cac5c0 100644 --- a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py +++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py @@ -65,9 +65,6 @@ class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel): _tied_weights_keys = ["lm_head.weight"] config_class = PaddleOCRVLConfig _no_split_modules = ["Ernie4_5DecoderLayer", "SiglipEncoderLayer"] - # Keep visual encoder in fp32 for ROCm stability (MIOpen bf16 conv has bugs) - # This also improves precision for vision processing - _keep_in_fp32_modules = ["visual", "mlp_AR"] base_model_prefix = "" def __init__(self, config): diff --git a/paddlex/inference/models/runners/paddle_static/runner.py b/paddlex/inference/models/runners/paddle_static/runner.py index 894feb3b12..9f84c4aac9 100644 --- a/paddlex/inference/models/runners/paddle_static/runner.py +++ b/paddlex/inference/models/runners/paddle_static/runner.py @@ -403,9 +403,6 @@ def _create(self): config.set_optimization_level(3) if self._model_name == "PP-DocLayoutV3": config.delete_pass("matmul_add_act_fuse_pass") - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") elif self._config["device_type"] == "npu": config.enable_custom_device("npu", self._config.get("device_id", 0)) if hasattr(config, "enable_new_ir"): @@ -459,9 +456,6 @@ def _create(self): config.disable_mkldnn() if hasattr(config, "enable_new_executor"): 
config.enable_new_executor() - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") elif self._config["device_type"] == "iluvatar_gpu": config.enable_custom_device( "iluvatar_gpu", int(self._config.get("device_id", 0)) @@ -493,18 +487,12 @@ def _create(self): if hasattr(config, "enable_new_executor"): config.enable_new_executor() config.set_optimization_level(3) - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") config.enable_memory_optim() for del_p in self._config.get("delete_pass", []): config.delete_pass(del_p) if not DEBUG: config.disable_glog_info() - if paddle.is_compiled_with_rocm(): - config.delete_pass("conv2d_add_act_fuse_pass") - config.delete_pass("conv2d_add_fuse_pass") predictor = paddle_inference.create_predictor(config)