Commit b2f50fa

refactor(test): remove model-specific tests
1 parent ed8b5fa commit b2f50fa

File tree

2 files changed: +0 additions, -280 deletions

tests/integration/test_agent_math500.py

Lines changed: 0 additions & 194 deletions
@@ -307,19 +307,6 @@ async def test_tool_name_in_trajectory(self, agent, model, tokenizer):
         )
         assert has_tool_reference, "Tool reference should appear in trajectory"
 
-    async def test_chat_template_markers_present(self, agent, model, tokenizer):
-        """Chat template markers appear in decoded trajectory."""
-        await agent.invoke_async("Hi")
-
-        decoded = tokenizer.decode(model.token_manager.token_ids)
-
-        # Qwen chat template uses these markers
-        assert "<|im_start|>" in decoded, "Should have im_start marker"
-        assert "<|im_end|>" in decoded, "Should have im_end marker"
-
-        # Should have role markers
-        assert "system" in decoded or "user" in decoded or "assistant" in decoded
-
 
 # =============================================================================
 # TITO Within Single Invocation Tests
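
The deleted test above asserted Qwen's ChatML markers, which is what made it model-specific. For reference, a single turn rendered by a Qwen-family chat template looks roughly like the sketch below; the literal string is illustrative, not output captured from this suite.

```python
# Approximate shape of a Qwen ChatML-rendered conversation (illustrative string,
# not captured output). Qwen templates wrap each turn in <|im_start|>/<|im_end|>.
rendered = (
    "<|im_start|>system\nBe brief.<|im_end|>\n"
    "<|im_start|>user\nHi<|im_end|>\n"
    "<|im_start|>assistant\nHello!<|im_end|>"
)
# These are exactly the markers test_chat_template_markers_present asserted on.
assert "<|im_start|>" in rendered and "<|im_end|>" in rendered
```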
@@ -804,184 +791,3 @@ async def test_context_preserved_across_turns(self, model, tokenizer):
             f"Context may not be properly preserved."
         )
 
-
-class TestMessageToTokenDrift:
-    """Tests for drift between agent.messages and TITO tokens.
-
-    This tests that reconstructing tokens from agent.messages via apply_chat_template
-    produces IDENTICAL tokens to what TITO recorded during generation.
-
-    This is critical for:
-    1. Offline RL from logged messages
-    2. Trajectory reconstruction from saved conversations
-    3. Verifying the model saw exactly what we think it saw
-
-    IMPORTANT LIMITATION - Thinking Models (e.g., Qwen3-4B-Thinking-2507):
-    Message-to-token reconstruction does NOT work reliably with thinking models because:
-    - Qwen3's chat template ALWAYS inserts <think>\n\n</think>\n\n before assistant content
-    - If the model generates without thinking, reconstruction adds 4 extra tokens
-    - If the model generates with thinking, template strips <think> from historical messages
-    - This is intentional template behavior (not a bug in our code)
-    - For offline RL with thinking models, you MUST use stored TITO tokens directly
-
-    The retokenization tests (encode→decode→encode) remain the critical guarantee
-    for RL training correctness and work correctly for all model types.
-    """
-
-    async def test_single_turn_message_token_match(self, model, tokenizer):
-        """Single turn: formatted messages produce identical tokens to TITO.
-
-        NOTE: Skipped for thinking models - see class docstring.
-        """
-        agent = Agent(
-            model=model,
-            tools=[calculator],
-            system_prompt="Be brief.",
-        )
-
-        await agent.invoke_async("What is 5 + 5?")
-
-        # Get TITO tokens
-        tito_tokens = model.token_manager.token_ids
-        tito_decoded = tokenizer.decode(tito_tokens)
-
-        # Check if this is a thinking model (generates <think> blocks OR template adds them)
-        is_thinking_model = "<think>" in tito_decoded or tokenizer.decode([151667]) == "<think>"
-
-        # Reconstruct from messages using the same formatting
-        openai_messages = model.format_request_messages(agent.messages, "Be brief.")
-        tools = model._current_tools
-
-        formatted = tokenizer.apply_chat_template(
-            openai_messages,
-            tokenize=False,
-            add_generation_prompt=False,
-            tools=tools,
-        )
-        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)
-
-        # Strip trailing newline from reconstructed (chat template formatting, not model output)
-        if reconstructed_tokens and reconstructed_tokens[-1] == 198:  # \n token
-            reconstructed_tokens = reconstructed_tokens[:-1]
-
-        tokens_match = list(tito_tokens) == list(reconstructed_tokens)
-
-        if is_thinking_model and not tokens_match:
-            pytest.skip(
-                "Message reconstruction not supported for thinking models. "
-                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
-                "Use stored TITO tokens for offline RL with thinking models."
-            )
-
-        assert tokens_match, (
-            f"Message-to-token drift detected!\n"
-            f"TITO: {len(tito_tokens)} tokens\n"
-            f"Reconstructed: {len(reconstructed_tokens)} tokens\n"
-            f"First diff at: {next((i for i, (a, b) in enumerate(zip(tito_tokens, reconstructed_tokens)) if a != b), 'length mismatch')}"
-        )
-
-    async def test_multi_turn_message_token_match(self, model, tokenizer):
-        """Multi-turn: formatted messages produce identical tokens to TITO.
-
-        NOTE: This test is skipped for thinking models (Qwen3 base) because
-        the chat template intentionally strips <think> blocks from historical
-        assistant messages. See class docstring for details.
-        """
-        agent = Agent(
-            model=model,
-            tools=[calculator],
-            system_prompt="Brief.",
-            callback_handler=None,  # Disable print callback
-        )
-
-        await agent.invoke_async("2+2=?")
-        await agent.invoke_async("3+3=?")
-
-        # Get TITO tokens
-        tito_tokens = model.token_manager.token_ids
-        tito_decoded = tokenizer.decode(tito_tokens)
-
-        # Check if this is a thinking model (generates <think> blocks OR template adds them)
-        is_thinking_model = "<think>" in tito_decoded or tokenizer.decode([151667]) == "<think>"
-
-        openai_messages = model.format_request_messages(agent.messages, "Brief.")
-        tools = model._current_tools
-
-        formatted = tokenizer.apply_chat_template(
-            openai_messages,
-            tokenize=False,
-            add_generation_prompt=False,
-            tools=tools,
-        )
-        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)
-
-        # Strip trailing newline (chat template formatting, not model output)
-        if reconstructed_tokens and reconstructed_tokens[-1] == 198:
-            reconstructed_tokens = reconstructed_tokens[:-1]
-
-        tokens_match = list(tito_tokens) == list(reconstructed_tokens)
-
-        if is_thinking_model and not tokens_match:
-            pytest.skip(
-                "Message reconstruction not supported for thinking models. "
-                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
-                "Use stored TITO tokens for offline RL with thinking models."
-            )
-
-        assert tokens_match, (
-            f"Multi-turn message-to-token drift!\n"
-            f"TITO: {len(tito_tokens)} tokens, Reconstructed: {len(reconstructed_tokens)} tokens"
-        )
-
-    async def test_tool_use_message_token_match(self, model, tokenizer):
-        """Tool use: formatted messages with tool calls produce identical tokens.
-
-        NOTE: Skipped for thinking models - see class docstring.
-        """
-        agent = Agent(
-            model=model,
-            tools=[calculator],
-            system_prompt="Use calculator.",
-        )
-
-        try:
-            await agent.invoke_async("Calculate 7 * 8.")
-        except MaxTokensReachedException:
-            pass
-
-        # Get TITO tokens
-        tito_tokens = model.token_manager.token_ids
-        tito_decoded = tokenizer.decode(tito_tokens)
-
-        # Check if this is a thinking model
-        is_thinking_model = "<think>" in tito_decoded or tokenizer.decode([151667]) == "<think>"
-
-        openai_messages = model.format_request_messages(agent.messages, "Use calculator.")
-        tools = model._current_tools
-
-        formatted = tokenizer.apply_chat_template(
-            openai_messages,
-            tokenize=False,
-            add_generation_prompt=False,
-            tools=tools,
-        )
-        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)
-
-        # Strip trailing newline (chat template formatting, not model output)
-        if reconstructed_tokens and reconstructed_tokens[-1] == 198:
-            reconstructed_tokens = reconstructed_tokens[:-1]
-
-        tokens_match = list(tito_tokens) == list(reconstructed_tokens)
-
-        if is_thinking_model and not tokens_match:
-            pytest.skip(
-                "Message reconstruction not supported for thinking models. "
-                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
-                "Use stored TITO tokens for offline RL with thinking models."
-            )
-
-        assert tokens_match, (
-            f"Tool use message-to-token drift!\n"
-            f"TITO: {len(tito_tokens)} tokens, Reconstructed: {len(reconstructed_tokens)} tokens"
-        )
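
The deleted class documented an invariant that is easy to check standalone. Below is a minimal sketch of the same drift check, assuming a HuggingFace-style tokenizer, OpenAI-format messages, and the token ids recorded during generation; the helper names are illustrative rather than strands_sglang API, and the trailing-newline token id 198 is a Qwen-specific detail carried over from the removed tests.

```python
# Minimal sketch of the message-to-token drift check the removed class performed.
# Assumes a HuggingFace tokenizer; helper names are illustrative, not library API.

def reconstruct_ids(tokenizer, openai_messages, tools=None):
    """Re-render the conversation via the chat template and re-encode it."""
    text = tokenizer.apply_chat_template(
        openai_messages,
        tokenize=False,
        add_generation_prompt=False,
        tools=tools,
    )
    ids = tokenizer.encode(text, add_special_tokens=False)
    # The template appends a newline the model never emitted; id 198 is "\n"
    # for Qwen tokenizers (model-specific assumption).
    if ids and ids[-1] == 198:
        ids = ids[:-1]
    return ids

def check_drift(tokenizer, recorded_ids, openai_messages, tools=None):
    """Raise if re-rendered messages do not reproduce the recorded token ids."""
    reconstructed = reconstruct_ids(tokenizer, openai_messages, tools)
    if list(recorded_ids) != list(reconstructed):
        first_diff = next(
            (i for i, (a, b) in enumerate(zip(recorded_ids, reconstructed)) if a != b),
            "length mismatch",
        )
        raise AssertionError(
            f"drift at {first_diff}: {len(recorded_ids)} vs {len(reconstructed)} tokens"
        )

def check_retokenization_roundtrip(tokenizer, recorded_ids):
    """The encode(decode(ids)) == ids guarantee the docstring calls critical."""
    text = tokenizer.decode(recorded_ids)
    assert tokenizer.encode(text, add_special_tokens=False) == list(recorded_ids)
```

As the class docstring warns, thinking models can fail `check_drift` by design, while the round-trip check is the one to rely on for RL training correctness.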

tests/integration/test_sglang_integration.py

Lines changed: 0 additions & 86 deletions
@@ -18,8 +18,6 @@
 Fixtures (model, tokenizer, calculator_tool) are provided by conftest.py.
 """
 
-from strands_sglang.client import SGLangClient
-
 
 class TestStreamBasic:
     """Basic streaming tests."""
@@ -221,87 +219,3 @@ async def test_model_stream_emits_strands_events(self, model):
         assert any("messageStart" in e for e in events)
         assert any("contentBlockDelta" in e for e in events)
         assert any("messageStop" in e for e in events)
-
-
-class TestEnableThinking:
-    """Tests for enable_thinking config option (Qwen3 hybrid thinking mode)."""
-
-    async def test_enable_thinking_true_generates_think_tokens(self, tokenizer, sglang_base_url):
-        """With enable_thinking=True, Qwen3 should generate <think> content."""
-        from strands_sglang import SGLangModel
-
-        client = SGLangClient(base_url=sglang_base_url)
-        model = SGLangModel(
-            tokenizer=tokenizer,
-            client=client,
-            enable_thinking=True,
-            params={"max_new_tokens": 1024, "temperature": 0.7},
-        )
-        model.reset()
-
-        messages = [{"role": "user", "content": [{"text": "What is 2 + 2? Think step by step."}]}]
-        async for _ in model.stream(messages):
-            pass
-
-        # Decode the response
-        response = tokenizer.decode(model.token_manager.token_ids)
-
-        # For Qwen3 with thinking enabled, should have <think> content
-        # Note: This test is Qwen3-specific
-        if "Qwen3" in tokenizer.name_or_path or "qwen3" in tokenizer.name_or_path.lower():
-            assert "<think>" in response, "Qwen3 with enable_thinking=True should generate <think> tokens"
-
-    async def test_enable_thinking_false_minimal_think_tokens(self, tokenizer, sglang_base_url):
-        """With enable_thinking=False, Qwen3 should have minimal/empty <think> content."""
-        from strands_sglang import SGLangModel
-
-        client = SGLangClient(base_url=sglang_base_url)
-        model = SGLangModel(
-            tokenizer=tokenizer,
-            client=client,
-            enable_thinking=False,
-            params={"max_new_tokens": 512, "temperature": 0.7},
-        )
-        model.reset()
-
-        messages = [{"role": "user", "content": [{"text": "What is 3 + 3?"}]}]
-        async for _ in model.stream(messages):
-            pass
-
-        response = tokenizer.decode(model.token_manager.token_ids)
-
-        # For Qwen3 with thinking disabled, <think> tags may exist but should be empty
-        if "Qwen3" in tokenizer.name_or_path or "qwen3" in tokenizer.name_or_path.lower():
-            # Check that if <think> exists, it's mostly empty (just whitespace)
-            if "<think>" in response and "</think>" in response:
-                import re
-
-                think_content = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
-                if think_content:
-                    # Empty or minimal thinking content (just whitespace)
-                    assert think_content.group(1).strip() == "", (
-                        f"With enable_thinking=False, <think> should be empty, got: {think_content.group(1)[:100]}"
-                    )
-
-    async def test_enable_thinking_none_default(self, tokenizer, sglang_base_url):
-        """With enable_thinking=None (default), parameter not passed to template."""
-        from strands_sglang import SGLangModel
-
-        # Default behavior - enable_thinking not set
-        client = SGLangClient(base_url=sglang_base_url)
-        model = SGLangModel(
-            tokenizer=tokenizer,
-            client=client,
-            params={"max_new_tokens": 256, "temperature": 0.7},
-        )
-
-        # Verify internal state
-        assert model._enable_thinking is None, "Default enable_thinking should be None"
-
-        model.reset()
-        messages = [{"role": "user", "content": [{"text": "Hi"}]}]
-        async for _ in model.stream(messages):
-            pass
-
-        # Should generate without error
-        assert len(model.token_manager) > 0, "Should generate tokens"
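
The deleted tests exercised Qwen3's hybrid thinking switch. On Qwen3 tokenizers the flag is consumed by the chat template itself, so it can be observed without SGLangModel at all. A minimal sketch follows, where the model id is illustrative and the assertions reflect Qwen3's documented template behavior rather than strands_sglang API:

```python
# Sketch of Qwen3's enable_thinking switch at the tokenizer level. transformers
# forwards extra kwargs from apply_chat_template into the Jinja template, and
# Qwen3's template reads enable_thinking from that context.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")  # illustrative model id
messages = [{"role": "user", "content": "What is 2 + 2?"}]

with_thinking = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
)
without_thinking = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
)

# With thinking disabled, the template pre-fills an empty <think></think> block
# so the model skips straight to the answer; with it enabled, the model opens
# its own <think> block during generation.
assert "</think>" in without_thinking
assert "</think>" not in with_thinking
```

This is also why the removed tests guarded their assertions behind a `name_or_path` check: the behavior is a property of the Qwen3 template, not of the client or model wrapper.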
