@@ -158,6 +158,7 @@ def __init__(
158158
159159 self .eos_token_id = self ._config_accessor .get_eos_token_id ()
160160
161+ self ._augment_eos_with_im_end ()
161162 # Build multimodal config (only meaningful for VLM models)
162163 self .mm_config = self ._config_accessor .build_mm_config ()
163164
@@ -628,6 +629,37 @@ def shutdown(self):
628629
629630 logger .debug ("Executor shutdown complete." )
630631
def _augment_eos_with_im_end(self):
    """Ensure ``<|im_end|>`` is treated as an EOS token when the vocab has it.

    Many chat models (Kimi-K2.5, Qwen, etc.) use ``<|im_end|>`` as the
    turn-ending token, yet their ``config.json`` only lists ``[EOS]`` as
    the EOS token. Without this augmentation the scheduler will never
    detect end-of-turn and generation will run until ``max_tokens``.

    The method reads ``self.tokenizer`` and ``self.eos_token_id`` and, when
    augmentation is needed, rebinds ``self.eos_token_id`` to a *new* list.
    It does nothing when the tokenizer is missing, exposes no
    ``get_vocab``, or its vocabulary has no ``<|im_end|>`` entry.

    Outcomes by current ``eos_token_id`` value:

    * ``None``            -> ``[im_end_id]``
    * ``list`` / ``tuple`` -> new list with ``im_end_id`` appended (if absent)
    * ``int``             -> ``[eos_token_id, im_end_id]`` (if different)
    * anything else       -> left untouched, with a warning logged
    """
    # ``getattr`` on a ``None`` tokenizer yields ``None`` too, so a missing
    # tokenizer falls through to the empty-vocab path below.
    vocab_getter = getattr(self.tokenizer, "get_vocab", None)
    vocab = vocab_getter() if vocab_getter else {}
    im_end_id = vocab.get("<|im_end|>")
    if im_end_id is None:
        return

    current = self.eos_token_id
    if current is None:
        self.eos_token_id = [im_end_id]
        logger.info(f"Set eos_token_id to [{im_end_id} ] (<|im_end|>)")
    elif isinstance(current, (list, tuple)):
        if im_end_id not in current:
            # Rebind to a fresh list instead of appending in place: the
            # existing sequence may be the very object held by the model
            # config, and mutating it would silently alter that config.
            self.eos_token_id = [*current, im_end_id]
            logger.info(f"Added <|im_end|> (id={im_end_id} ) to eos_token_id list")
    elif isinstance(current, int):
        if current != im_end_id:
            self.eos_token_id = [current, im_end_id]
            logger.info(
                f"Expanded eos_token_id to {self.eos_token_id} "
                f"(added <|im_end|> id={im_end_id} )"
            )
    else:
        # Previously this case was skipped silently; surface it so a missing
        # end-of-turn token is diagnosable.
        logger.warning(
            f"Unsupported eos_token_id type {type(current).__name__}; "
            f"<|im_end|> (id={im_end_id}) was not added"
        )
662+
631663 def _process_text_request (self , rid : str , messages : list , raw_request : Dict ) -> list :
632664 """Process a text-only request using the tokenizer."""
633665 if self .tokenizer .chat_template :
@@ -748,6 +780,29 @@ def _handle_raw_request(self, raw_request: Dict):
748780 if "ignore_eos" in raw_sampling_params :
749781 sampling_params .ignore_eos = raw_sampling_params ["ignore_eos" ]
750782
783+ # Also read OpenAI-style top-level sampling parameters as fallback
784+ if "temperature" in raw_request and raw_sampling_params is None :
785+ sampling_params .temperature = raw_request ["temperature" ]
786+ if sampling_params .temperature == 0.0 :
787+ sampling_params .temperature = 1.0
788+ sampling_params .top_k = 1
789+ if "top_p" in raw_request and raw_sampling_params is None :
790+ sampling_params .top_p = raw_request ["top_p" ]
791+
792+ # When tools are present, add tool-call-related stop token IDs so the
793+ # scheduler halts generation at the tool-call boundary instead of
794+ # running until max_tokens.
795+ tools = raw_request .get ("tools" )
796+ if tools and self .tokenizer is not None :
797+ from parallax .utils .tokenizer_utils import get_tool_call_stop_token_ids
798+
799+ tool_stop_ids = get_tool_call_stop_token_ids (self .tokenizer )
800+ if tool_stop_ids :
801+ if sampling_params .stop_token_ids is None :
802+ sampling_params .stop_token_ids = set ()
803+ sampling_params .stop_token_ids .update (tool_stop_ids )
804+ logger .debug (f"Added tool call stop token IDs for request { rid } : { tool_stop_ids } " )
805+
751806 req = InitialRequest (
752807 request_id = rid ,
753808 output_ids = None ,
0 commit comments