[template] rename template `decode` method to `decode_generate_ids` and update all callers (#9523)

Jintao-Huang · web-flow · commit 139a3d7bb002 · 2026-06-09T19:34:19.000+08:00
diff --git a/swift/infer_engine/grpo_vllm_engine.py b/swift/infer_engine/grpo_vllm_engine.py
@@ -108,7 +108,7 @@ def _create_chat_completion_response(self, result, inputs, request_config, reque
         choices = []
         for output in result.outputs:
             output.token_ids = list(output.token_ids)
-            response = self.template.decode(output.token_ids, template_inputs=inputs['template_inputs'])
+            response = self.template.decode_generate_ids(output.token_ids, template_inputs=inputs['template_inputs'])
             logprobs = self._get_logprobs(output.logprobs, output.token_ids, request_config.top_logprobs)
             toolcall = self._get_toolcall(response)
 
diff --git a/swift/infer_engine/lmdeploy_engine.py b/swift/infer_engine/lmdeploy_engine.py
@@ -226,7 +226,7 @@ async def _infer_stream_async(
                 toolcall = None
                 if is_finished:
                     toolcall = self._get_toolcall(
-                        self.template.decode(output.token_ids, template_inputs=inputs['template_inputs']))
+                        self.template.decode_generate_ids(output.token_ids, template_inputs=inputs['template_inputs']))
                 finish_reason = self._get_finish_reason(generation_config.max_new_tokens, output.num_token,
                                                         output.status.name == 'FINISH')
                 choices = [
@@ -261,7 +261,7 @@ async def _infer_full_async(
                 async for output in generator.async_stream_infer(session_id=session_id, **inputs, **kwargs):
                     pass
 
-        response = self.template.decode(output.token_ids, template_inputs=inputs['template_inputs'])
+        response = self.template.decode_generate_ids(output.token_ids, template_inputs=inputs['template_inputs'])
         logprobs = self._get_logprobs(output.logprobs, output.token_ids, request_config.top_logprobs)
 
         usage_info = self._get_usage_info(len(inputs['input_ids']), output.num_token)
diff --git a/swift/infer_engine/sglang_engine.py b/swift/infer_engine/sglang_engine.py
@@ -185,7 +185,7 @@ def _create_chat_completion_response(self, output, inputs, return_details: bool
         assert output is not None
         meta_info = output['meta_info']
         usage_info = self._get_usage_info(meta_info['prompt_tokens'], meta_info['completion_tokens'])
-        response = self.template.decode(output['output_ids'], template_inputs=inputs['template_inputs'])
+        response = self.template.decode_generate_ids(output['output_ids'], template_inputs=inputs['template_inputs'])
         toolcall = self._get_toolcall(response)
         token_ids = output['output_ids'] if return_details else None
         choice = ChatCompletionResponseChoice(
@@ -289,7 +289,8 @@ def _create_chat_completion_stream_response(self, output, infer_streamer) -> Opt
         toolcall = None
         if is_finished:
             finish_reason = finish_reason['type']
-            toolcall = self._get_toolcall(self.template.decode(output['output_ids'], **infer_streamer.decode_kwargs))
+            toolcall = self._get_toolcall(
+                self.template.decode_generate_ids(output['output_ids'], **infer_streamer.decode_kwargs))
         meta_info = output['meta_info']
         usage_info = self._get_usage_info(meta_info['prompt_tokens'], meta_info['completion_tokens'])
         # TODO: logprobs
diff --git a/swift/infer_engine/transformers_engine.py b/swift/infer_engine/transformers_engine.py
@@ -310,7 +310,7 @@ def _model_generate(**kwargs):
                 toolcall = None
                 if is_finished[i]:
                     toolcall = self._get_toolcall(
-                        self.template.decode(generate_ids, template_inputs=template_inputs[i]))
+                        self.template.decode_generate_ids(generate_ids, template_inputs=template_inputs[i]))
                 finish_reason = self._get_finish_reason(generation_config.max_new_tokens, usage_info.completion_tokens,
                                                         is_finished[i])
 
@@ -434,7 +434,7 @@ def _infer_full(self, inputs: Dict[str, Any], *, generation_config: GenerationCo
 
                 logprobs = self._get_logprobs(logprobs_list, generate_ids, request_config.top_logprobs)
                 usage_info = self._update_usage_info(usage_info, len(generate_ids))
-                response = self.template.decode(generate_ids, template_inputs=template_inputs[i])
+                response = self.template.decode_generate_ids(generate_ids, template_inputs=template_inputs[i])
                 finish_reason = self._get_finish_reason(generation_config.max_new_tokens, len(generate_ids), True)
                 toolcall = self._get_toolcall(response)
                 token_ids = generate_ids if request_config.return_details else None
diff --git a/swift/infer_engine/utils.py b/swift/infer_engine/utils.py
@@ -87,7 +87,7 @@ def get_printable_text(self, raw_tokens: List[int], is_finished: bool) -> str:
         raw_tokens = raw_tokens[self.cache_idx:]
         if self.first_token:
             raw_tokens = []
-        response = self.template.decode(
+        response = self.template.decode_generate_ids(
             raw_tokens, is_finished=is_finished, first_token=self.first_token, **self.decode_kwargs)
         response = self._align_blank_suffix(response)
         return self._get_response(response, is_finished, len(raw_tokens))
diff --git a/swift/infer_engine/vllm_engine.py b/swift/infer_engine/vllm_engine.py
@@ -611,7 +611,7 @@ def _create_chat_completion_stream_response(self, result, request_config, reques
             toolcall = None
             if output.is_finished:
                 toolcall = self._get_toolcall(
-                    self.template.decode(output.token_ids, **infer_streamers[i].decode_kwargs))
+                    self.template.decode_generate_ids(output.token_ids, **infer_streamers[i].decode_kwargs))
 
             choice = ChatCompletionResponseStreamChoice(
                 index=i,
@@ -664,7 +664,7 @@ def _create_chat_completion_response(
         choices = []
         for output in result.outputs:
             output.token_ids = list(output.token_ids)
-            response = self.template.decode(output.token_ids, template_inputs=inputs['template_inputs'])
+            response = self.template.decode_generate_ids(output.token_ids, template_inputs=inputs['template_inputs'])
 
             # Extract reasoning content if reasoning_parser is enabled
             reasoning_content = None
diff --git a/swift/template/base.py b/swift/template/base.py
@@ -721,13 +721,13 @@ def decode_seq_cls(self, logits: torch.Tensor, top_logprobs: int):
             logprobs = [self._get_seq_cls_logprobs(pred, logprobs[i], top_logprobs) for i, pred in enumerate(preds)]
         return preds, logprobs
 
-    def decode(self,
-               generate_ids: List[int],
-               *,
-               is_finished: bool = True,
-               first_token=True,
-               template_inputs=None,
-               **kwargs) -> Any:
+    def decode_generate_ids(self,
+                            generate_ids: List[int],
+                            *,
+                            is_finished: bool = True,
+                            first_token=True,
+                            template_inputs=None,
+                            **kwargs) -> Any:
         if kwargs.get('spaces_between_special_tokens') is None:
             kwargs['spaces_between_special_tokens'] = False
         generate_ids = self.skip_stop_tokens(generate_ids, is_finished)
diff --git a/swift/template/templates/baai.py b/swift/template/templates/baai.py
@@ -115,7 +115,7 @@ def prepare_generate_kwargs(self, generate_kwargs: Dict[str, Any], *, model=None
         res['logits_processor'] = logits_processor
         return res
 
-    def decode(self, generate_ids: List[int], **kwargs) -> Any:
+    def decode_generate_ids(self, generate_ids: List[int], **kwargs) -> Any:
         mm_list = self.processor.decode(generate_ids)
         for im in mm_list:
             if not isinstance(im, Image.Image):
diff --git a/swift/template/templates/deepseek.py b/swift/template/templates/deepseek.py
@@ -186,9 +186,9 @@ def generate(self, model, *args, **kwargs):
 
             return {'sequences': generated_tokens}
 
-    def decode(self, generate_ids: List[int], **kwargs) -> Any:
+    def decode_generate_ids(self, generate_ids: List[int], **kwargs) -> Any:
         if 'template_inputs' not in kwargs or not kwargs['template_inputs'].generate_mode:
-            return super().decode(generate_ids, **kwargs)
+            return super().decode_generate_ids(generate_ids, **kwargs)
         else:
             img_size = get_env_args('img_size', int, 384)
             patch_size = 16
diff --git a/swift/template/templates/glm.py b/swift/template/templates/glm.py
@@ -33,8 +33,8 @@ def _swift_encode(self, inputs: StdTemplateInputs):
                     res_context_list[i] = res_context_list[i][:-len('\n')]
         return res_context_list, loss_scale_list, answer_len
 
-    def decode(self, *args, **kwargs):
-        response = super().decode(*args, **kwargs)
+    def decode_generate_ids(self, *args, **kwargs):
+        response = super().decode_generate_ids(*args, **kwargs)
         return response.lstrip('\n') if self.strip_newline else response
 
 
diff --git a/swift/template/templates/microsoft.py b/swift/template/templates/microsoft.py
@@ -60,8 +60,8 @@ def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, An
                 image_features, inputs_embeds)
         return {'inputs_embeds': inputs_embeds}
 
-    def decode(self, generate_ids: List[int], **kwargs) -> Any:
-        response = super().decode(generate_ids, **kwargs)
+    def decode_generate_ids(self, generate_ids: List[int], **kwargs) -> Any:
+        response = super().decode_generate_ids(generate_ids, **kwargs)
         template_inputs = kwargs.get('template_inputs')
         images = template_inputs.images
         image_size = None