
Commit bfaa406

review feedback
1 parent a8bb1f5 commit bfaa406

5 files changed: +20 -30 lines changed


onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc (+4 -4)
@@ -20,9 +20,9 @@ namespace transformers {
 
 Inputs:
   input_ids: int32 (B, 1)
-  encoder_input_ids: int32 (B, encode_sequence_length) (optional)
+  encoder_input_ids: int32 (B, encode_sequence_length) (optional for old format; removed in new format)
   encoder_attention_mask: int32 (B, encode_sequence_length)
-  encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) (optional)
+  encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) (optional for old format; removed in new format)
 
   past_key_self_0: (B, num_heads, past_decode_sequence_length, head_size)
   past_value_self_0: (B, num_heads, past_decode_sequence_length, head_size)
@@ -147,7 +147,8 @@ Status T5DecoderSubgraph::Validate(const std::vector<const NodeArg*>& subgraph_i
 // decoder_feeds: input_ids, encoder_attention_mask,
 // present_key_self_0, present_value_self_0, ...,
 // present_key_cross_0, present_value_cross_0, ...
-
+// past_seq_len (optional), num_beams (optional), cache_indirection (optional)
+//
 // Old format:
 // encoder feeds: encoder_input_ids, encoder_attention_mask, decoder_input_ids (with start tokens)
 // encoder fetches: logits, encoder_hidden_states,
@@ -157,7 +158,6 @@ Status T5DecoderSubgraph::Validate(const std::vector<const NodeArg*>& subgraph_i
 // present_key_self_0, present_value_self_0, ...,
 // present_key_cross_0, present_value_cross_0, ...
 // past_seq_len (optional), num_beams (optional), cache_indirection (optional)
-
 Status T5DecoderSubgraph::CreateInitialFeeds(
     AllocatorPtr cpu_allocator,
     gsl::span<const int32_t> beam_next_tokens,
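As a minimal sketch of the feed layout documented in this hunk: the function and argument names below are hypothetical, only the ordering of the new-format decoder feeds comes from the comment above.

```python
# Hypothetical sketch of the new-format decoder feed order documented above.
# Only the ordering is taken from the subgraph comment; the function and
# argument names are illustrative, not the actual C++ implementation.
def build_decoder_feeds(
    input_ids,                 # int32 (B, 1)
    encoder_attention_mask,    # int32 (B, encode_sequence_length)
    self_kv_pairs,             # [(present_key_self_i, present_value_self_i), ...]
    cross_kv_pairs,            # [(present_key_cross_i, present_value_cross_i), ...]
    past_seq_len=None,         # optional trailing feeds
    num_beams=None,
    cache_indirection=None,
):
    feeds = [input_ids, encoder_attention_mask]
    for key, value in self_kv_pairs:
        feeds.extend([key, value])
    for key, value in cross_kv_pairs:
        feeds.extend([key, value])
    # Optional feeds keep this relative order when present.
    feeds.extend(t for t in (past_seq_len, num_beams, cache_indirection) if t is not None)
    return feeds
```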

onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py (+1 -1)
@@ -212,7 +212,7 @@ def export_onnx_models(
     else:
         logger.info(f"Skip exporting: existed ONNX model {onnx_path}")
 
-    # Optimize ONNX graph. Note that we have not implemented graph optimization for T5 yet.
+    # Optimize ONNX graph.
     if optimize_onnx or precision != Precision.FLOAT32:
         onnx_shape_path = None
         if shape_infer_before_optimization:
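One consequence of the gate in this hunk: exporting at any precision other than fp32 runs graph optimization even when `optimize_onnx` is not set. A minimal standalone sketch of that predicate, with `Precision` stubbed as an assumption:

```python
# Standalone sketch of the optimization gate above; this Precision enum is a
# stub for illustration, not the tool's actual class.
from enum import Enum

class Precision(Enum):
    FLOAT32 = "fp32"
    FLOAT16 = "fp16"
    INT8 = "int8"

def should_optimize(optimize_onnx: bool, precision: Precision) -> bool:
    # Non-fp32 export goes through the optimizer path, so optimization is
    # forced whenever precision != fp32.
    return optimize_onnx or precision != Precision.FLOAT32

assert should_optimize(False, Precision.FLOAT16)      # fp16 implies optimization
assert not should_optimize(False, Precision.FLOAT32)  # fp32 only when requested
```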

onnxruntime/python/tools/transformers/onnx_model_t5.py (+12 -12)
@@ -237,7 +237,7 @@ def fuse_t5_encoder(self, softmax_node, input_name_to_nodes, output_name_to_node
         )
         if qkv_nodes is None:
             return False
-        matmul_qkv, _transpose_qkv, reshape_qkv = qkv_nodes
+        matmul_qkv, _, reshape_qkv = qkv_nodes
 
         qkv_shape_nodes = self.model.match_parent_path(
             reshape_qkv,
@@ -298,7 +298,7 @@ def fuse_t5_encoder(self, softmax_node, input_name_to_nodes, output_name_to_node
             output_name_to_node,
         )
         if mask_nodes is None:
-            return
+            return False
         mul_node = mask_nodes[2]
 
         _, mul_val = self.model.get_constant_input(mul_node)
@@ -357,7 +357,7 @@ def fuse_t5_encoder(self, softmax_node, input_name_to_nodes, output_name_to_node
         )
         if k_nodes is None:
             return False
-        _, reshape_k, matmul_k = k_nodes
+        _, _, matmul_k = k_nodes
         # todo: check reshape_k parent nodes
 
         q_nodes = self.model.match_parent_path(
@@ -368,7 +368,7 @@ def fuse_t5_encoder(self, softmax_node, input_name_to_nodes, output_name_to_node
         if q_nodes is None:
             return False
 
-        transpose_q, reshape_q, matmul_q = q_nodes
+        _, reshape_q, matmul_q = q_nodes
         # todo: check reshape_q parent nodes
 
         if matmul_q.input[0] != input_shape_node.input[0]:
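The `return False` fix above matters because a bare `return` yields `None`: both are falsy, so truthiness checks hid the inconsistency, but identity checks would not. A standalone illustration (this `match` function is illustrative, not from the fusion code):

```python
# Why the fusion fix replaces a bare `return` with `return False`:
# both results are falsy, but they are not the same value.
def match(found: bool):
    if not found:
        return        # a bare return yields None
    return True

assert match(False) is None       # not False
assert match(False) is not False
assert not match(False)           # still falsy, which is why the bug was latent
```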
@@ -690,6 +690,7 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
 
         gather = compute_bias_nodes[5]
         where = compute_bias_nodes[-1]
+        slice = compute_bias_nodes[2]
         unsqueeze = compute_bias_nodes[3]
 
         # Current fusion will not remove the node until the graph is processed.
@@ -790,10 +791,8 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
         # Unsqueeze(axes=0)   Cast(to=int64)
         #               \     /
         #                 Sub
-        #
-        # Founatutionally, there is still Slice to get last seq_len rows so end result is same.
-        #
-        # But need to be careful that the shape of some intermediate nodes are changed.
+        # Currently, there is still Slice to get last seq_len rows so end result is same.
+        # But need to be careful that the shape of bias tensor is changed before Slice.
         #
         # RelativePositionBias operator requires query_length == key_length so we shall pass in total_seq_len.
         # Here we get the end value of the Range node as length to pass to the RelativePositionBias node.
@@ -802,20 +801,21 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
         # only compute seq_len rows, then we can remove the Slice after the RelativePositionBias node.
         inputs = [bias_table.name, range_node.input[1], range_node.input[1]]
 
-        outputs = [unsqueeze.output[0]]
+        # Use a new tensor name since the shape might be different as mentioned above.
+        bias_output = node_name + "_rel_pos_bias"
+        slice.input[0] = bias_output
+
         rpb_node = helper.make_node(
             "RelativePositionBias",
             inputs=inputs,
-            outputs=outputs,
+            outputs=[bias_output],
             name=node_name,
         )
         rpb_node.domain = "com.microsoft"
         rpb_node.attribute.extend([helper.make_attribute("max_distance", max_distance)])
         rpb_node.attribute.extend([helper.make_attribute("is_bidirectional", is_bidirectional)])
         self.node_name_to_graph_name[rpb_node.name] = self.this_graph_name
         self.nodes_to_add.append(rpb_node)
-
-        self.nodes_to_remove.append(unsqueeze)
         self.prune_graph = True

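For reference, a self-contained sketch of the node construction in the last hunk, using onnx.helper: the op type, domain, and attribute names mirror the fusion code, while the tensor names and the max_distance value (128 is T5's usual default) are placeholders.

```python
# Self-contained sketch of building the com.microsoft RelativePositionBias
# contrib node as the fusion above does. Tensor names and attribute values
# here are placeholders, not taken from a real model.
from onnx import helper

rpb_node = helper.make_node(
    "RelativePositionBias",
    # The operator requires query_length == key_length, so the fusion passes
    # the Range node's end value (total_seq_len) for both lengths.
    inputs=["bias_table", "total_seq_len", "total_seq_len"],
    # A fresh output name: the bias shape can differ from the tensor it
    # replaces, so the downstream Slice input is rewired to this name.
    outputs=["rel_pos_bias"],
    name="RelPosBias_0",
    domain="com.microsoft",
)
rpb_node.attribute.extend([helper.make_attribute("max_distance", 128)])
rpb_node.attribute.extend([helper.make_attribute("is_bidirectional", True)])
```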
onnxruntime/test/python/transformers/test_generation.py (+3 -11)
@@ -196,7 +196,7 @@ def get_tiny_t5_model_dir():
 
 
 class TestBeamSearchT5(unittest.TestCase):
-    """Test BeamSearch for T5 model"""
+    """Test BeamSearch for T5 model with fp32 in CPU"""
 
     def setUp(self):
         tiny_model_dir = get_tiny_t5_model_dir()
@@ -215,8 +215,6 @@ def setUp(self):
             "--repetition_penalty 2.0",
         ]
 
-        self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers()
-
         export_t5_onnx_models(
             self.model_name,
             os.path.join(".", "cache_models"),
@@ -263,13 +261,6 @@ def run_beam_search(self, extra_arguments: str):
         result = run(arguments)
         self.assertTrue(result["parity"], f"ORT and PyTorch result is different on CPU for arguments {arguments}")
 
-        # Test GPU
-        if self.enable_cuda:
-            if "--use_gpu" not in arguments:
-                arguments.append("--use_gpu")
-            result = run(arguments)
-            self.assertTrue(result["parity"], f"ORT and PyTorch result is different on GPU for arguments {arguments}")
-
         os.remove(self.beam_search_onnx_path)
 
     def test_return_sequences(self):
@@ -333,6 +324,7 @@ def check_encoder_fusion(self):
         onnx_model = OnnxModel(model)
         op_counters = onnx_model.get_operator_statistics()
         print("encoder ops", op_counters)
+
         expected_node_count = {
             "RelativePositionBias": 1,
             "SimplifiedLayerNormalization": 5 if use_tiny_model else 13,
@@ -351,7 +343,7 @@ def check_decoder_fusion(self):
 
         onnx_model = OnnxModel(model)
         op_counters = onnx_model.get_operator_statistics()
-        print("decoder opators", op_counters)
+        print("decoder ops", op_counters)
 
         expected_node_count = {
             "RelativePositionBias": 1,

onnxruntime/test/testdata/transformers/tiny_t5/tiny_t5.py (+0 -2)
@@ -14,8 +14,6 @@
 save_directory = "tiny_t5"
 model_name = "google-t5/t5-small"
 
-model = T5ForConditionalGeneration.from_pretrained(model_name)
-
 config = T5Config.from_pretrained(model_name)
 
 config.num_heads = 2
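The deleted `from_pretrained` call was dead code: the script only needs the config, shrinks it, and (outside this hunk) builds a randomly initialized model from it. A hedged sketch of that remainder; the overrides besides num_heads and the save call are assumptions:

```python
# Hedged sketch of the rest of the tiny-model recipe (not shown in this hunk).
# The overrides besides num_heads are assumptions for illustration.
from transformers import T5Config, T5ForConditionalGeneration

config = T5Config.from_pretrained("google-t5/t5-small")
config.num_heads = 2
config.num_layers = 2   # assumed additional shrink
config.d_model = 64     # assumed
config.d_kv = 4         # assumed (T5 projects to num_heads * d_kv explicitly)
config.d_ff = 128       # assumed

tiny_model = T5ForConditionalGeneration(config)  # randomly initialized, tiny shapes
tiny_model.save_pretrained("tiny_t5")            # assumed save into save_directory
```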
