Commit 4fb30b8
Fix TVM verification and add docstring for RemoveEmptyConcat pattern callback (#2516)

1) Added a docstring for the RemoveEmptyConcat pattern callback and renamed the callback from RemoveConcat to RemoveEmptyConcat based on [this comment](#2421 (comment)).
2) The TVM verification function (verify_tvm_compile) was improperly called in the run_pattern_callbacks function in `forge/forge/tvm_calls/relay/op/forge_passes.py`, and verify_tvm_compile is also used by both run_forge_compile_passes and compile_for_forge. To avoid circular import issues, moved the TVM verification functions (verify_tvm_compile and its helpers) to `forge/forge/tvm_calls/relay/op/utils.py`.
3) Framework outputs are now extracted only when the verify_tvm_compile config is enabled.
1 parent 786f15c commit 4fb30b8
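For context on the circular-import fix in point 2, here is a schematic of the dependency change (module contents elided; the cycle direction is inferred from the commit message rather than verified against the full sources):

```python
# Before: forge_passes.py could not cleanly import verify_tvm_compile from
# forge.py, because forge.py already depends on forge_passes.py, so importing
# back would close a cycle:
#
#   forge.py --imports--> forge_passes.py --would import--> forge.py
#
# After: verify_tvm_compile and its helpers live in utils.py, a leaf module
# that imports neither of the other two, so both can depend on it safely:
#
#   forge.py        --> utils.py
#   forge_passes.py --> utils.py
from forge.tvm_calls.relay.op.utils import verify_tvm_compile
```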

File tree

forge/forge/tvm_calls/forge_compile.py
forge/forge/tvm_calls/forge_utils.py
forge/forge/tvm_calls/relay/op/forge.py
forge/forge/tvm_calls/relay/op/forge_passes.py
forge/forge/tvm_calls/relay/op/utils.py

5 files changed: +133 −109 lines

forge/forge/tvm_calls/forge_compile.py

Lines changed: 2 additions & 1 deletion
@@ -32,7 +32,8 @@
 import onnx
 import onnx.numpy_helper
 from tvm.relay.expr import Tuple
-from forge.tvm_calls.relay.op.forge import verify_tvm_compile, flatten_IO, compile_for_forge, partition_for_forge
+from forge.tvm_calls.relay.op.forge import flatten_IO, compile_for_forge, partition_for_forge
+from forge.tvm_calls.relay.op.utils import verify_tvm_compile
 from jax.experimental import jax2tf
 from jax.tools.jax_to_ir import tf_wrap_with_input_names
 from transformers import FlaxPreTrainedModel

forge/forge/tvm_calls/forge_utils.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def extract_framework_model_outputs(
 ):
     framework_outputs = []

-    if verify_tvm_compile:
+    if not verify_tvm_compile:
         return framework_outputs

     if framework == "pytorch" or framework == "paddle":
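A minimal sketch of the corrected guard (the function's full signature is elided in the hunk above; the signature and PyTorch branch below are illustrative): with verification disabled, the function now returns the empty list immediately, whereas the old check skipped extraction exactly when verification was enabled.

```python
# Hypothetical, trimmed-down extract_framework_model_outputs; only the guard
# is taken from the diff above, the rest is a sketch.
def extract_framework_model_outputs(framework, model, inputs, verify_tvm_compile=False):
    framework_outputs = []

    # Fixed guard: skip the framework forward pass unless verification is enabled.
    if not verify_tvm_compile:
        return framework_outputs

    if framework == "pytorch" or framework == "paddle":
        outputs = model(*inputs)
        framework_outputs = [out.detach().numpy() for out in outputs]
    return framework_outputs
```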

forge/forge/tvm_calls/relay/op/forge.py

Lines changed: 0 additions & 101 deletions
@@ -1004,107 +1004,6 @@ def visit_call(self, call):
         super().visit_call(call)


-def get_relay_output(mod, params, inputs, target):
-    # Build and run Relay modules with inputs given as (key: tensor) pairs,
-    # so inputs don't need to be in the same order as 'mod' defines.
-    ret_type = mod["main"].checked_type.ret_type
-    with tvm.transform.PassContext(opt_level=0):
-        lib = relay.build_module.build(mod, target=target, params=params)
-    m = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
-    m.run(**inputs)
-
-    def _unflatten(flat_iter, cur_type):
-        import tvm.relay.ty as _ty
-
-        if isinstance(cur_type, _ty.TensorType):
-            return next(flat_iter)
-        if isinstance(cur_type, _ty.TupleType):
-            fields = []
-            for field_type in cur_type.fields:
-                field = _unflatten(flat_iter, field_type)
-                fields.append(field)
-            return fields
-        raise ValueError("Return type", ret_type, "contains unsupported type", cur_type)
-
-    flattened = []
-    import tvm.runtime.ndarray as _nd
-
-    for i in range(m.get_num_outputs()):
-        flattened.append(m.get_output(i).copyto(_nd.cpu(0)))
-    relay_outputs = _unflatten(iter(flattened), ret_type)
-
-    if not isinstance(relay_outputs, (list, tuple)):
-        relay_outputs = [relay_outputs]
-    relay_outputs = [x.numpy() for x in flattened]
-    return relay_outputs
-
-
-def verify_outputs(framework_outputs, relay_outputs, compile_location, rtol=1e-02, atol=1e-04, pcc=None):
-    allowed_to_fail = False
-    if len(framework_outputs) != len(relay_outputs):
-        logger.error(
-            f"Different number of outputs. Framework: {len(framework_outputs)}, TVM: {len(relay_outputs)} after {compile_location}"
-        )
-
-    for i, (fr_out, tvm_out) in enumerate(zip(framework_outputs, relay_outputs)):
-        if fr_out.shape != tvm_out.shape:
-            logger.error(
-                f"Different shapes for outputs. Framework: {fr_out.shape}, TVM: {tvm_out.shape} after {compile_location}"
-            )
-
-        if pcc is None:
-            ok = np.allclose(fr_out, tvm_out, rtol=rtol, atol=atol, equal_nan=True)
-        else:
-            pcc_value = np.min(
-                np.ma.corrcoef(np.ma.masked_invalid(fr_out.flatten()), np.ma.masked_invalid(tvm_out.flatten()))
-            )
-            if isinstance(pcc_value, np.ma.core.MaskedConstant):
-                pcc_value = 1.0
-            ok = pcc_value >= pcc
-
-        if not ok:
-            logger.error(f"Tensor mismatch on output {i} between framework and TVM after {compile_location}.")
-            logger.trace(f"Framework: (shape = {fr_out.shape})")
-            logger.trace(fr_out)
-            logger.trace(f"TVM: (shape = {tvm_out.shape})")
-            logger.trace(tvm_out)
-            logger.info(
-                "Max ATOL Delta: "
-                + "{:.3e}".format(np.max(np.abs((fr_out - tvm_out))).item())
-                + ", atol="
-                + "{}".format(atol)
-            )
-            logger.info(
-                "Max RTOL Delta: "
-                + "{:.3e}".format(np.max(np.abs((fr_out - tvm_out)) / tvm_out).item())
-                + ", rtol="
-                + "{}".format(rtol)
-            )
-            if pcc is not None:
-                logger.info(f"PCC got={pcc_value}, required={pcc}")
-            if not allowed_to_fail:
-                raise RuntimeError
-
-    logger.info(f"Verified TVM Relay outputs against framework outputs after {compile_location}")
-
-
-def verify_tvm_compile(mod, params, inputs, target, framework_outputs, compile_location, verify_cfg=None):
-    relay_outputs = get_relay_output(mod, params, inputs, target)
-
-    # Verify compile passes (original relay passes + forge passes)
-    if verify_cfg:
-        verify_outputs(
-            framework_outputs,
-            relay_outputs,
-            compile_location,
-            rtol=verify_cfg.rtol,
-            atol=verify_cfg.atol,
-            pcc=verify_cfg.pcc,
-        )
-    else:
-        verify_outputs(framework_outputs, relay_outputs, compile_location)
-
-
 class CompareWarner(DFPatternCallback):
     def __init__(self):
         super().__init__(require_type=True, rewrite_once=True)
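These functions are not deleted outright; they move verbatim to `forge/forge/tvm_calls/relay/op/utils.py` (last file below). For reference, a self-contained sketch of the build-and-run flow that get_relay_output wraps, on a made-up one-op module:

```python
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

# Trivial Relay module: y = relu(x).
x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

# opt_level=0 mirrors get_relay_output: compile without optimizations so the
# executed graph stays close to the Relay being verified.
with tvm.transform.PassContext(opt_level=0):
    lib = relay.build_module.build(mod, target="llvm")

# Inputs are passed by name, so their order need not match the module's.
m = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
m.run(x=np.array([[-1.0, 0.0, 2.0, -3.0]], dtype="float32"))
print(m.get_output(0).numpy())  # [[0. 0. 2. 0.]]
```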

forge/forge/tvm_calls/relay/op/forge_passes.py

Lines changed: 26 additions & 5 deletions
@@ -4990,7 +4990,30 @@ def callback(self, pre, post, node_map):
         return out


-class RemoveConcat(DFPatternCallback):
+class RemoveEmptyConcat(DFPatternCallback):
+    """
+    Relay pass to eliminate unnecessary `concatenate` ops involving empty tensors.
+
+    In some models (e.g., Phi-3), rotary embedding logic performs slicing on the last
+    dimension of the query tensor to split it into two parts. If the slicing boundaries
+    are incorrectly defined, this may create an empty tensor (e.g., a shape with size 0
+    along the concatenation axis).
+
+    This pass identifies `concatenate` operations between two tensors where one has
+    dimension size 0 along the concatenation axis, and removes the redundant concat
+    by returning the non-empty operand directly.
+
+    This prevents downstream errors like:
+        `AssertionError: start < operandA.shape[dim]`
+    which occur due to operations on invalid tensor slices.
+
+    Example:
+        q_rot = query[..., 0:96]
+        q_pass = query[..., 96:96]      # shape: (1, 32, 256, 0)
+        concat(q_rot, q_pass, axis=-1)  # → rewritten to just q_rot
+    """
+
     def __init__(self):
         super().__init__(rewrite_once=False, require_type=True)
         self.act1 = wildcard()

@@ -5053,9 +5076,7 @@ def run_pattern_callbacks(
             raise ex
         if run_verify:
             logger.trace(f"Verifying {callback_name}")
-            tvm.relay.op.contrib.forge.forge.verify_tvm_compile(
-                relay_module, params, inputs, target, framework_outputs, callback_name, verify_cfg
-            )
+            verify_tvm_compile(relay_module, params, inputs, target, framework_outputs, callback_name, verify_cfg)

     return relay_module

@@ -5161,7 +5182,7 @@ def run_forge_compile_passes(
             SimplifyVITOnnxAttention(),
             GQABroadcastReshape(),
             RemoveDenseInputSqueeze(),
-            RemoveConcat(),
+            RemoveEmptyConcat(),
         ],
         params=params,
         inputs=inputs,
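For readers unfamiliar with TVM's dataflow pattern API, a self-contained sketch of the kind of rewrite RemoveEmptyConcat performs; the pattern and callback below are illustrative, not the exact implementation above:

```python
import tvm
from tvm import relay
from tvm.relay.dataflow_pattern import DFPatternCallback, is_op, is_tuple, rewrite, wildcard


class RemoveEmptyConcatSketch(DFPatternCallback):
    def __init__(self):
        super().__init__(require_type=True)
        self.act1 = wildcard()
        self.act2 = wildcard()
        # Match a two-operand concatenate.
        self.pattern = is_op("concatenate")(is_tuple([self.act1, self.act2]))

    def callback(self, pre, post, node_map):
        a = node_map[self.act1][0]
        b = node_map[self.act2][0]
        axis = int(pre.attrs.axis)
        if axis < 0:
            axis += len(a.checked_type.shape)
        # If one operand is empty along the concat axis, the concat is a no-op:
        # return the other operand directly.
        if int(a.checked_type.shape[axis]) == 0:
            return b
        if int(b.checked_type.shape[axis]) == 0:
            return a
        return post


# Shapes mirror the Phi-3 example from the docstring.
q_rot = relay.var("q_rot", shape=(1, 32, 256, 96))
q_pass = relay.var("q_pass", shape=(1, 32, 256, 0))
func = relay.Function([q_rot, q_pass], relay.concatenate([q_rot, q_pass], axis=-1))
mod = relay.transform.InferType()(tvm.IRModule.from_expr(func))
print(rewrite(RemoveEmptyConcatSketch(), mod["main"]))  # body becomes just q_rot
```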

forge/forge/tvm_calls/relay/op/utils.py

Lines changed: 104 additions & 1 deletion
@@ -2,13 +2,116 @@
 #
 # SPDX-License-Identifier: Apache-2.0

-import numpy as np
 import numpy as np
 from tvm.relay.dataflow_pattern import *
+import tvm
+from tvm import relay
+from tvm.contrib import graph_executor

 from loguru import logger


+def get_relay_output(mod, params, inputs, target):
+    # Build and run Relay modules with inputs given as (key: tensor) pairs,
+    # so inputs don't need to be in the same order as 'mod' defines.
+    ret_type = mod["main"].checked_type.ret_type
+    with tvm.transform.PassContext(opt_level=0):
+        lib = relay.build_module.build(mod, target=target, params=params)
+    m = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
+    m.run(**inputs)
+
+    def _unflatten(flat_iter, cur_type):
+        import tvm.relay.ty as _ty
+
+        if isinstance(cur_type, _ty.TensorType):
+            return next(flat_iter)
+        if isinstance(cur_type, _ty.TupleType):
+            fields = []
+            for field_type in cur_type.fields:
+                field = _unflatten(flat_iter, field_type)
+                fields.append(field)
+            return fields
+        raise ValueError("Return type", ret_type, "contains unsupported type", cur_type)
+
+    flattened = []
+    import tvm.runtime.ndarray as _nd
+
+    for i in range(m.get_num_outputs()):
+        flattened.append(m.get_output(i).copyto(_nd.cpu(0)))
+    relay_outputs = _unflatten(iter(flattened), ret_type)
+
+    if not isinstance(relay_outputs, (list, tuple)):
+        relay_outputs = [relay_outputs]
+    relay_outputs = [x.numpy() for x in flattened]
+    return relay_outputs
+
+
+def verify_outputs(framework_outputs, relay_outputs, compile_location, rtol=1e-02, atol=1e-04, pcc=None):
+    allowed_to_fail = False
+    if len(framework_outputs) != len(relay_outputs):
+        logger.error(
+            f"Different number of outputs. Framework: {len(framework_outputs)}, TVM: {len(relay_outputs)} after {compile_location}"
+        )
+
+    for i, (fr_out, tvm_out) in enumerate(zip(framework_outputs, relay_outputs)):
+        if fr_out.shape != tvm_out.shape:
+            logger.error(
+                f"Different shapes for outputs. Framework: {fr_out.shape}, TVM: {tvm_out.shape} after {compile_location}"
+            )
+
+        if pcc is None:
+            ok = np.allclose(fr_out, tvm_out, rtol=rtol, atol=atol, equal_nan=True)
+        else:
+            pcc_value = np.min(
+                np.ma.corrcoef(np.ma.masked_invalid(fr_out.flatten()), np.ma.masked_invalid(tvm_out.flatten()))
+            )
+            if isinstance(pcc_value, np.ma.core.MaskedConstant):
+                pcc_value = 1.0
+            ok = pcc_value >= pcc
+
+        if not ok:
+            logger.error(f"Tensor mismatch on output {i} between framework and TVM after {compile_location}.")
+            logger.trace(f"Framework: (shape = {fr_out.shape})")
+            logger.trace(fr_out)
+            logger.trace(f"TVM: (shape = {tvm_out.shape})")
+            logger.trace(tvm_out)
+            logger.info(
+                "Max ATOL Delta: "
+                + "{:.3e}".format(np.max(np.abs((fr_out - tvm_out))).item())
+                + ", atol="
+                + "{}".format(atol)
+            )
+            logger.info(
+                "Max RTOL Delta: "
+                + "{:.3e}".format(np.max(np.abs((fr_out - tvm_out)) / tvm_out).item())
+                + ", rtol="
+                + "{}".format(rtol)
+            )
+            if pcc is not None:
+                logger.info(f"PCC got={pcc_value}, required={pcc}")
+            if not allowed_to_fail:
+                raise RuntimeError
+
+    logger.info(f"Verified TVM Relay outputs against framework outputs after {compile_location}")
+
+
+def verify_tvm_compile(mod, params, inputs, target, framework_outputs, compile_location, verify_cfg=None):
+    relay_outputs = get_relay_output(mod, params, inputs, target)
+
+    # Verify compile passes (original relay passes + forge passes)
+    if verify_cfg:
+        verify_outputs(
+            framework_outputs,
+            relay_outputs,
+            compile_location,
+            rtol=verify_cfg.rtol,
+            atol=verify_cfg.atol,
+            pcc=verify_cfg.pcc,
+        )
+    else:
+        verify_outputs(framework_outputs, relay_outputs, compile_location)
+
+
 def is_unsqueeze(call):
     input_shape = call.args[0].checked_type.shape
     output_shape = call.checked_type.shape
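As a quick illustration of the PCC branch in verify_outputs, a runnable snippet computing the same statistic on made-up outputs:

```python
import numpy as np

fr_out = np.array([1.0, 2.0, 3.0, 4.0])     # pretend framework output
tvm_out = np.array([1.0, 2.0, 3.0, 4.001])  # pretend TVM output

# Pearson correlation of the flattened outputs; masked_invalid ignores
# NaN/Inf entries, and np.min picks the off-diagonal entry of the 2x2
# correlation matrix (the diagonal is always 1).
pcc_value = np.min(
    np.ma.corrcoef(np.ma.masked_invalid(fr_out.flatten()), np.ma.masked_invalid(tvm_out.flatten()))
)
print(pcc_value)          # ~0.9999999
print(pcc_value >= 0.99)  # True -> would pass verification at pcc=0.99
```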
