@@ -307,19 +307,6 @@ async def test_tool_name_in_trajectory(self, agent, model, tokenizer):
307307 )
308308 assert has_tool_reference , "Tool reference should appear in trajectory"
309309
310- async def test_chat_template_markers_present (self , agent , model , tokenizer ):
311- """Chat template markers appear in decoded trajectory."""
312- await agent .invoke_async ("Hi" )
313-
314- decoded = tokenizer .decode (model .token_manager .token_ids )
315-
316- # Qwen chat template uses these markers
317- assert "<|im_start|>" in decoded , "Should have im_start marker"
318- assert "<|im_end|>" in decoded , "Should have im_end marker"
319-
320- # Should have role markers
321- assert "system" in decoded or "user" in decoded or "assistant" in decoded
322-
323310
324311# =============================================================================
325312# TITO Within Single Invocation Tests
@@ -804,184 +791,3 @@ async def test_context_preserved_across_turns(self, model, tokenizer):
804791 f"Context may not be properly preserved."
805792 )
806793
807-
class TestMessageToTokenDrift:
    """Tests for drift between agent.messages and TITO tokens.

    This tests that reconstructing tokens from agent.messages via apply_chat_template
    produces IDENTICAL tokens to what TITO recorded during generation.

    This is critical for:
    1. Offline RL from logged messages
    2. Trajectory reconstruction from saved conversations
    3. Verifying the model saw exactly what we think it saw

    IMPORTANT LIMITATION - Thinking Models (e.g., Qwen3-4B-Thinking-2507):
    Message-to-token reconstruction does NOT work reliably with thinking models because:
    - Qwen3's chat template ALWAYS inserts <think>\\n\\n</think>\\n\\n before assistant content
    - If the model generates without thinking, reconstruction adds 4 extra tokens
    - If the model generates with thinking, template strips <think> from historical messages
    - This is intentional template behavior (not a bug in our code)
    - For offline RL with thinking models, you MUST use stored TITO tokens directly

    The retokenization tests (encode->decode->encode) remain the critical guarantee
    for RL training correctness and work correctly for all model types.
    """

    # Qwen3 vocabulary id of the "<think>" special token; used to detect
    # thinking models even when the sampled trajectory contains no <think> text.
    _THINK_TOKEN_ID = 151667
    # Qwen tokenizer id for "\n"; apply_chat_template appends one trailing
    # newline that is template formatting, not model output.
    _NEWLINE_TOKEN_ID = 198

    def _assert_no_message_token_drift(self, model, tokenizer, agent, system_prompt, label):
        """Reconstruct tokens from agent.messages and assert they equal TITO's.

        Skips (rather than fails) on thinking models, whose chat template
        intentionally rewrites <think> blocks — see the class docstring.

        Args:
            model: Model under test; supplies token_manager, format_request_messages,
                and _current_tools.
            tokenizer: HF tokenizer used for decode/encode/apply_chat_template.
            agent: Agent whose .messages are re-formatted.
            system_prompt: Same system prompt the agent was constructed with.
            label: First line of the assertion message on drift.
        """
        tito_tokens = model.token_manager.token_ids
        tito_decoded = tokenizer.decode(tito_tokens)

        # Thinking model if the trajectory contains <think>, or the vocab even
        # defines the <think> special token (template may add it on reconstruction).
        is_thinking_model = (
            "<think>" in tito_decoded
            or tokenizer.decode([self._THINK_TOKEN_ID]) == "<think>"
        )

        # Re-format the conversation exactly as the model would for a request.
        openai_messages = model.format_request_messages(agent.messages, system_prompt)
        tools = model._current_tools

        formatted = tokenizer.apply_chat_template(
            openai_messages,
            tokenize=False,
            add_generation_prompt=False,
            tools=tools,
        )
        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)

        # Strip the template's trailing newline (formatting, not model output).
        if reconstructed_tokens and reconstructed_tokens[-1] == self._NEWLINE_TOKEN_ID:
            reconstructed_tokens = reconstructed_tokens[:-1]

        tokens_match = list(tito_tokens) == list(reconstructed_tokens)

        if is_thinking_model and not tokens_match:
            pytest.skip(
                "Message reconstruction not supported for thinking models. "
                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
                "Use stored TITO tokens for offline RL with thinking models."
            )

        first_diff = next(
            (i for i, (a, b) in enumerate(zip(tito_tokens, reconstructed_tokens)) if a != b),
            "length mismatch",
        )
        assert tokens_match, (
            f"{label}\n"
            f"TITO: {len(tito_tokens)} tokens\n"
            f"Reconstructed: {len(reconstructed_tokens)} tokens\n"
            f"First diff at: {first_diff}"
        )

    async def test_single_turn_message_token_match(self, model, tokenizer):
        """Single turn: formatted messages produce identical tokens to TITO.

        NOTE: Skipped for thinking models - see class docstring.
        """
        agent = Agent(
            model=model,
            tools=[calculator],
            system_prompt="Be brief.",
        )

        await agent.invoke_async("What is 5 + 5?")

        self._assert_no_message_token_drift(
            model, tokenizer, agent, "Be brief.", "Message-to-token drift detected!"
        )

    async def test_multi_turn_message_token_match(self, model, tokenizer):
        """Multi-turn: formatted messages produce identical tokens to TITO.

        NOTE: This test is skipped for thinking models (Qwen3 base) because
        the chat template intentionally strips <think> blocks from historical
        assistant messages. See class docstring for details.
        """
        agent = Agent(
            model=model,
            tools=[calculator],
            system_prompt="Brief.",
            callback_handler=None,  # Disable print callback
        )

        await agent.invoke_async("2+2=?")
        await agent.invoke_async("3+3=?")

        self._assert_no_message_token_drift(
            model, tokenizer, agent, "Brief.", "Multi-turn message-to-token drift!"
        )

    async def test_tool_use_message_token_match(self, model, tokenizer):
        """Tool use: formatted messages with tool calls produce identical tokens.

        NOTE: Skipped for thinking models - see class docstring.
        """
        agent = Agent(
            model=model,
            tools=[calculator],
            system_prompt="Use calculator.",
        )

        # Best-effort: the generation may hit the token cap mid-tool-call; the
        # drift check below still applies to whatever was recorded.
        try:
            await agent.invoke_async("Calculate 7 * 8.")
        except MaxTokensReachedException:
            pass

        self._assert_no_message_token_drift(
            model, tokenizer, agent, "Use calculator.", "Tool use message-to-token drift!"
        )
987-
0 commit comments