Commit b2f50fa

refactor(test): remove model-specific tests
1 parent ed8b5fa commit b2f50fa

File tree

2 files changed: +0 additions, -280 deletions

tests/integration/test_agent_math500.py

Lines changed: 0 additions & 194 deletions
@@ -307,19 +307,6 @@ async def test_tool_name_in_trajectory(self, agent, model, tokenizer):
         )
         assert has_tool_reference, "Tool reference should appear in trajectory"
 
-    async def test_chat_template_markers_present(self, agent, model, tokenizer):
-        """Chat template markers appear in decoded trajectory."""
-        await agent.invoke_async("Hi")
-
-        decoded = tokenizer.decode(model.token_manager.token_ids)
-
-        # Qwen chat template uses these markers
-        assert "<|im_start|>" in decoded, "Should have im_start marker"
-        assert "<|im_end|>" in decoded, "Should have im_end marker"
-
-        # Should have role markers
-        assert "system" in decoded or "user" in decoded or "assistant" in decoded
-
 
 # =============================================================================
 # TITO Within Single Invocation Tests
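
The deleted test above asserted Qwen's ChatML markers, which is what made it model-specific. For reference, a single turn rendered by a Qwen-family chat template looks roughly like the sketch below; the literal string is illustrative, not output captured from this suite.

```python
# Approximate shape of a Qwen ChatML-rendered conversation (illustrative string,
# not captured output). Qwen templates wrap each turn in <|im_start|>/<|im_end|>.
rendered = (
    "<|im_start|>system\nBe brief.<|im_end|>\n"
    "<|im_start|>user\nHi<|im_end|>\n"
    "<|im_start|>assistant\nHello!<|im_end|>"
)
# These are exactly the markers test_chat_template_markers_present asserted on.
assert "<|im_start|>" in rendered and "<|im_end|>" in rendered
```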
@@ -804,184 +791,3 @@ async def test_context_preserved_across_turns(self, model, tokenizer):
             f"Context may not be properly preserved."
         )
 
-
-class TestMessageToTokenDrift:
-    """Tests for drift between agent.messages and TITO tokens.
-
-    This tests that reconstructing tokens from agent.messages via apply_chat_template
-    produces IDENTICAL tokens to what TITO recorded during generation.
-
-    This is critical for:
-    1. Offline RL from logged messages
-    2. Trajectory reconstruction from saved conversations
-    3. Verifying the model saw exactly what we think it saw
-
-    IMPORTANT LIMITATION - Thinking Models (e.g., Qwen3-4B-Thinking-2507):
-    Message-to-token reconstruction does NOT work reliably with thinking models because:
-    - Qwen3's chat template ALWAYS inserts <think>\n\n</think>\n\n before assistant content
-    - If the model generates without thinking, reconstruction adds 4 extra tokens
-    - If the model generates with thinking, template strips <think> from historical messages
-    - This is intentional template behavior (not a bug in our code)
-    - For offline RL with thinking models, you MUST use stored TITO tokens directly
-
-    The retokenization tests (encode→decode→encode) remain the critical guarantee
-    for RL training correctness and work correctly for all model types.
-    """
-
-    async def test_single_turn_message_token_match(self, model, tokenizer):
-        """Single turn: formatted messages produce identical tokens to TITO.
-
-        NOTE: Skipped for thinking models - see class docstring.
-        """
-        agent = Agent(
-            model=model,
-            tools=[calculator],
-            system_prompt="Be brief.",
-        )
-
-        await agent.invoke_async("What is 5 + 5?")
-
-        # Get TITO tokens
-        tito_tokens = model.token_manager.token_ids
-        tito_decoded = tokenizer.decode(tito_tokens)
-
-        # Check if this is a thinking model (generates <think> blocks OR template adds them)
-        is_thinking_model = "<think>" in tito_decoded or tokenizer.decode([151667]) == "<think>"
-
-        # Reconstruct from messages using the same formatting
-        openai_messages = model.format_request_messages(agent.messages, "Be brief.")
-        tools = model._current_tools
-
-        formatted = tokenizer.apply_chat_template(
-            openai_messages,
-            tokenize=False,
-            add_generation_prompt=False,
-            tools=tools,
-        )
-        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)
-
-        # Strip trailing newline from reconstructed (chat template formatting, not model output)
-        if reconstructed_tokens and reconstructed_tokens[-1] == 198:  # \n token
-            reconstructed_tokens = reconstructed_tokens[:-1]
-
-        tokens_match = list(tito_tokens) == list(reconstructed_tokens)
-
-        if is_thinking_model and not tokens_match:
-            pytest.skip(
-                "Message reconstruction not supported for thinking models. "
-                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
-                "Use stored TITO tokens for offline RL with thinking models."
-            )
-
-        assert tokens_match, (
-            f"Message-to-token drift detected!\n"
-            f"TITO: {len(tito_tokens)} tokens\n"
-            f"Reconstructed: {len(reconstructed_tokens)} tokens\n"
-            f"First diff at: {next((i for i, (a, b) in enumerate(zip(tito_tokens, reconstructed_tokens)) if a != b), 'length mismatch')}"
-        )
-
-    async def test_multi_turn_message_token_match(self, model, tokenizer):
-        """Multi-turn: formatted messages produce identical tokens to TITO.
-
-        NOTE: This test is skipped for thinking models (Qwen3 base) because
-        the chat template intentionally strips <think> blocks from historical
-        assistant messages. See class docstring for details.
-        """
-        agent = Agent(
-            model=model,
-            tools=[calculator],
-            system_prompt="Brief.",
-            callback_handler=None,  # Disable print callback
-        )
-
-        await agent.invoke_async("2+2=?")
-        await agent.invoke_async("3+3=?")
-
-        # Get TITO tokens
-        tito_tokens = model.token_manager.token_ids
-        tito_decoded = tokenizer.decode(tito_tokens)
-
-        # Check if this is a thinking model (generates <think> blocks OR template adds them)
-        is_thinking_model = "<think>" in tito_decoded or tokenizer.decode([151667]) == "<think>"
-
-        openai_messages = model.format_request_messages(agent.messages, "Brief.")
-        tools = model._current_tools
-
-        formatted = tokenizer.apply_chat_template(
-            openai_messages,
-            tokenize=False,
-            add_generation_prompt=False,
-            tools=tools,
-        )
-        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)
-
-        # Strip trailing newline (chat template formatting, not model output)
-        if reconstructed_tokens and reconstructed_tokens[-1] == 198:
-            reconstructed_tokens = reconstructed_tokens[:-1]
-
-        tokens_match = list(tito_tokens) == list(reconstructed_tokens)
-
-        if is_thinking_model and not tokens_match:
-            pytest.skip(
-                "Message reconstruction not supported for thinking models. "
-                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
-                "Use stored TITO tokens for offline RL with thinking models."
-            )
-
-        assert tokens_match, (
-            f"Multi-turn message-to-token drift!\n"
-            f"TITO: {len(tito_tokens)} tokens, Reconstructed: {len(reconstructed_tokens)} tokens"
-        )
-
-    async def test_tool_use_message_token_match(self, model, tokenizer):
-        """Tool use: formatted messages with tool calls produce identical tokens.
-
-        NOTE: Skipped for thinking models - see class docstring.
-        """
-        agent = Agent(
-            model=model,
-            tools=[calculator],
-            system_prompt="Use calculator.",
-        )
-
-        try:
-            await agent.invoke_async("Calculate 7 * 8.")
-        except MaxTokensReachedException:
-            pass
-
-        # Get TITO tokens
-        tito_tokens = model.token_manager.token_ids
-        tito_decoded = tokenizer.decode(tito_tokens)
-
-        # Check if this is a thinking model
-        is_thinking_model = "<think>" in tito_decoded or tokenizer.decode([151667]) == "<think>"
-
-        openai_messages = model.format_request_messages(agent.messages, "Use calculator.")
-        tools = model._current_tools
-
-        formatted = tokenizer.apply_chat_template(
-            openai_messages,
-            tokenize=False,
-            add_generation_prompt=False,
-            tools=tools,
-        )
-        reconstructed_tokens = tokenizer.encode(formatted, add_special_tokens=False)
-
-        # Strip trailing newline (chat template formatting, not model output)
-        if reconstructed_tokens and reconstructed_tokens[-1] == 198:
-            reconstructed_tokens = reconstructed_tokens[:-1]
-
-        tokens_match = list(tito_tokens) == list(reconstructed_tokens)
-
-        if is_thinking_model and not tokens_match:
-            pytest.skip(
-                "Message reconstruction not supported for thinking models. "
-                "Qwen3's chat template inserts <think></think> blocks unconditionally. "
-                "Use stored TITO tokens for offline RL with thinking models."
-            )
-
-        assert tokens_match, (
-            f"Tool use message-to-token drift!\n"
-            f"TITO: {len(tito_tokens)} tokens, Reconstructed: {len(reconstructed_tokens)} tokens"
-        )
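
The deleted class documented an invariant that is easy to check standalone. Below is a minimal sketch of the same drift check, assuming a HuggingFace-style tokenizer, OpenAI-format messages, and the token ids recorded during generation; the helper names are illustrative rather than strands_sglang API, and the trailing-newline token id 198 is a Qwen-specific detail carried over from the removed tests.

```python
# Minimal sketch of the message-to-token drift check the removed class performed.
# Assumes a HuggingFace tokenizer; helper names are illustrative, not library API.

def reconstruct_ids(tokenizer, openai_messages, tools=None):
    """Re-render the conversation via the chat template and re-encode it."""
    text = tokenizer.apply_chat_template(
        openai_messages,
        tokenize=False,
        add_generation_prompt=False,
        tools=tools,
    )
    ids = tokenizer.encode(text, add_special_tokens=False)
    # The template appends a newline the model never emitted; id 198 is "\n"
    # for Qwen tokenizers (model-specific assumption).
    if ids and ids[-1] == 198:
        ids = ids[:-1]
    return ids

def check_drift(tokenizer, recorded_ids, openai_messages, tools=None):
    """Raise if re-rendered messages do not reproduce the recorded token ids."""
    reconstructed = reconstruct_ids(tokenizer, openai_messages, tools)
    if list(recorded_ids) != list(reconstructed):
        first_diff = next(
            (i for i, (a, b) in enumerate(zip(recorded_ids, reconstructed)) if a != b),
            "length mismatch",
        )
        raise AssertionError(
            f"drift at {first_diff}: {len(recorded_ids)} vs {len(reconstructed)} tokens"
        )

def check_retokenization_roundtrip(tokenizer, recorded_ids):
    """The encode(decode(ids)) == ids guarantee the docstring calls critical."""
    text = tokenizer.decode(recorded_ids)
    assert tokenizer.encode(text, add_special_tokens=False) == list(recorded_ids)
```

As the class docstring warns, thinking models can fail `check_drift` by design, while the round-trip check is the one to rely on for RL training correctness.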

tests/integration/test_sglang_integration.py

Lines changed: 0 additions & 86 deletions
@@ -18,8 +18,6 @@
 Fixtures (model, tokenizer, calculator_tool) are provided by conftest.py.
 """
 
-from strands_sglang.client import SGLangClient
-
 
 class TestStreamBasic:
     """Basic streaming tests."""
@@ -221,87 +219,3 @@ async def test_model_stream_emits_strands_events(self, model):
         assert any("messageStart" in e for e in events)
         assert any("contentBlockDelta" in e for e in events)
         assert any("messageStop" in e for e in events)
-
-
-class TestEnableThinking:
-    """Tests for enable_thinking config option (Qwen3 hybrid thinking mode)."""
-
-    async def test_enable_thinking_true_generates_think_tokens(self, tokenizer, sglang_base_url):
-        """With enable_thinking=True, Qwen3 should generate <think> content."""
-        from strands_sglang import SGLangModel
-
-        client = SGLangClient(base_url=sglang_base_url)
-        model = SGLangModel(
-            tokenizer=tokenizer,
-            client=client,
-            enable_thinking=True,
-            params={"max_new_tokens": 1024, "temperature": 0.7},
-        )
-        model.reset()
-
-        messages = [{"role": "user", "content": [{"text": "What is 2 + 2? Think step by step."}]}]
-        async for _ in model.stream(messages):
-            pass
-
-        # Decode the response
-        response = tokenizer.decode(model.token_manager.token_ids)
-
-        # For Qwen3 with thinking enabled, should have <think> content
-        # Note: This test is Qwen3-specific
-        if "Qwen3" in tokenizer.name_or_path or "qwen3" in tokenizer.name_or_path.lower():
-            assert "<think>" in response, "Qwen3 with enable_thinking=True should generate <think> tokens"
-
-    async def test_enable_thinking_false_minimal_think_tokens(self, tokenizer, sglang_base_url):
-        """With enable_thinking=False, Qwen3 should have minimal/empty <think> content."""
-        from strands_sglang import SGLangModel
-
-        client = SGLangClient(base_url=sglang_base_url)
-        model = SGLangModel(
-            tokenizer=tokenizer,
-            client=client,
-            enable_thinking=False,
-            params={"max_new_tokens": 512, "temperature": 0.7},
-        )
-        model.reset()
-
-        messages = [{"role": "user", "content": [{"text": "What is 3 + 3?"}]}]
-        async for _ in model.stream(messages):
-            pass
-
-        response = tokenizer.decode(model.token_manager.token_ids)
-
-        # For Qwen3 with thinking disabled, <think> tags may exist but should be empty
-        if "Qwen3" in tokenizer.name_or_path or "qwen3" in tokenizer.name_or_path.lower():
-            # Check that if <think> exists, it's mostly empty (just whitespace)
-            if "<think>" in response and "</think>" in response:
-                import re
-
-                think_content = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
-                if think_content:
-                    # Empty or minimal thinking content (just whitespace)
-                    assert think_content.group(1).strip() == "", (
-                        f"With enable_thinking=False, <think> should be empty, got: {think_content.group(1)[:100]}"
-                    )
-
-    async def test_enable_thinking_none_default(self, tokenizer, sglang_base_url):
-        """With enable_thinking=None (default), parameter not passed to template."""
-        from strands_sglang import SGLangModel
-
-        # Default behavior - enable_thinking not set
-        client = SGLangClient(base_url=sglang_base_url)
-        model = SGLangModel(
-            tokenizer=tokenizer,
-            client=client,
-            params={"max_new_tokens": 256, "temperature": 0.7},
-        )
-
-        # Verify internal state
-        assert model._enable_thinking is None, "Default enable_thinking should be None"
-
-        model.reset()
-        messages = [{"role": "user", "content": [{"text": "Hi"}]}]
-        async for _ in model.stream(messages):
-            pass
-
-        # Should generate without error
-        assert len(model.token_manager) > 0, "Should generate tokens"
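
The deleted tests exercised Qwen3's hybrid thinking switch. On Qwen3 tokenizers the flag is consumed by the chat template itself, so it can be observed without SGLangModel at all. A minimal sketch follows, where the model id is illustrative and the assertions reflect Qwen3's documented template behavior rather than strands_sglang API:

```python
# Sketch of Qwen3's enable_thinking switch at the tokenizer level. transformers
# forwards extra kwargs from apply_chat_template into the Jinja template, and
# Qwen3's template reads enable_thinking from that context.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")  # illustrative model id
messages = [{"role": "user", "content": "What is 2 + 2?"}]

with_thinking = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
)
without_thinking = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
)

# With thinking disabled, the template pre-fills an empty <think></think> block
# so the model skips straight to the answer; with it enabled, the model opens
# its own <think> block during generation.
assert "</think>" in without_thinking
assert "</think>" not in with_thinking
```

This is also why the removed tests guarded their assertions behind a `name_or_path` check: the behavior is a property of the Qwen3 template, not of the client or model wrapper.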
