Skip to content

Commit e6e2fa1

Browse files
committed
add golden tests
1 parent 73ea0ed commit e6e2fa1

2 files changed

Lines changed: 296 additions & 0 deletions

File tree

golden_tests/test_cookbook_examples.py

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
# %%
3636
import base64
37+
import dataclasses as _dc
3738
import os
3839
import tempfile
3940
from contextlib import contextmanager
@@ -46,6 +47,8 @@
4647
from pydantic import BaseModel, Field
4748

4849
import kaggle_benchmarks as kbench
50+
from kaggle_benchmarks import actors as _actors
51+
from kaggle_benchmarks.chats import ChatRoom as _ChatRoom
4952
from kaggle_benchmarks.content_types import audios, images, videos
5053

5154
# Models to be tested as the primary subject.
@@ -1047,3 +1050,288 @@ def test_tool_with_schema_output(llm):
10471050
result.population,
10481051
expectation="Population should be 3,700,000.",
10491052
)
1053+
1054+
1055+
# %%
1056+
# --- Test Case: ChatRoom — add_participant + LLM Cloning ---
1057+
# Verifies that the same LLM object can be added as multiple participants via
1058+
# add_participant(), each receiving a distinct identity (name, system_prompt)
1059+
# and producing correct responses without role collision.
1060+
1061+
1062+
# Model identifiers that the ChatRoom golden tests in this file pass to
# ``benchmark_test(include=...)``.
CHATROOM_LLM_NAMES = {
    "google/gemini-2.5-flash",
    "google/gemini-3-flash-preview",
    "anthropic/claude-sonnet-4-6@default",
}
1067+
1068+
1069+
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_add_participant(llm):
    """Check that one LLM registered twice behaves as two separate participants.

    The same ``llm`` object is passed to ``add_participant()`` under two
    names with different system prompts. Both participants answer a single
    posted question; the test then checks object identity, the content of
    each reply, and the sender names recorded in ``room.messages``.
    """
    room = _ChatRoom(system_prompt="A quick Q&A between two experts.", name="Host")

    python_expert = room.add_participant(
        llm,
        name="Alice",
        avatar="👩",
        system_prompt="You are Alice, a Python expert. Always mention Python in your replies.",
    )
    rust_expert = room.add_participant(
        llm,
        name="Bob",
        avatar="👨",
        system_prompt="You are Bob, a Rust expert. Always mention Rust in your replies.",
    )

    question = "Each expert, name your favorite programming language in one sentence."
    with room:
        room.post(question)
        python_reply = python_expert.talk()
        rust_reply = rust_expert.talk()

    # The two handles returned for the same LLM must not be the same object.
    kbench.assertions.assert_true(
        python_expert is not rust_expert,
        "add_participant must return distinct objects for the same LLM.",
    )

    # Each reply should reflect the system prompt given to that participant.
    reply_checks = (
        (r"(?i)python", python_reply, "Alice (Python expert) should mention Python."),
        (r"(?i)rust", rust_reply, "Bob (Rust expert) should mention Rust."),
    )
    for pattern, reply, expected in reply_checks:
        kbench.assertions.assert_contains_regex(
            pattern,
            reply,
            expectation=expected,
        )

    # The transcript should attribute the replies to the right sender.
    sender_checks = (
        (1, "Alice", "Second message sender should be Alice."),
        (2, "Bob", "Third message sender should be Bob."),
    )
    for position, sender_name, expected in sender_checks:
        kbench.assertions.assert_equal(
            sender_name,
            room.messages[position].sender.name,
            expectation=expected,
        )
1127+
1128+
1129+
# %%
1130+
# --- Test Case: ChatRoom — Structured Output via talk(schema=) ---
1131+
# Verifies that talk(schema=) returns structured output (dataclass) from
1132+
# within a ChatRoom context, combining multi-participant rooms with schema.
1133+
1134+
1135+
@_dc.dataclass(frozen=True)
class _CityFact:
    """A structured fact about a city.

    Used as the ``schema`` argument of ``talk()`` in the ChatRoom
    structured-output test. Instances are immutable (``frozen=True``).
    """

    # Name of the city.
    city: str
    # Name of the country.
    country: str
    # Population expressed in millions.
    population_millions: float
1142+
1143+
1144+
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_talk_structured_output(llm):
    """Check that ``talk(schema=...)`` returns structured data inside a ChatRoom.

    A scripted ``Actor`` asks a geography question and the LLM participant
    answers with ``schema=_CityFact``. The fields of the returned object are
    then checked.
    """
    room = _ChatRoom(system_prompt="A geography quiz game.", name="QuizMaster")

    quiz_master = _actors.Actor(name="QuizMaster", role="user", avatar="🎯")
    room.add_participant(quiz_master)
    contestant = room.add_participant(
        llm,
        name="Player",
        system_prompt="You are a geography expert. Answer questions accurately.",
    )

    question = (
        "What is the capital of France? Provide city, country, and approximate population in millions."
    )
    with room:
        quiz_master.talk(question)
        answer = contestant.talk(schema=_CityFact)

    # Text fields: (pattern, attribute name, expectation message).
    text_checks = (
        (r"(?i)paris", "city", "City should be Paris."),
        (r"(?i)france", "country", "Country should be France."),
    )
    for pattern, field_name, expected in text_checks:
        kbench.assertions.assert_contains_regex(
            pattern,
            getattr(answer, field_name),
            expectation=expected,
        )

    # Only a loose sanity range is enforced on the numeric field.
    population_is_plausible = 0.5 < answer.population_millions < 15.0
    kbench.assertions.assert_true(
        population_is_plausible,
        f"Population should be reasonable, got {answer.population_millions}M.",
    )
1181+
1182+
1183+
# %%
1184+
# --- Test Case: ChatRoom — Multi-Turn Conversation ---
1185+
# Verifies that room.post() and talk() produce correct multi-turn histories
1186+
# and that room.messages captures the full transcript after exit.
1187+
1188+
1189+
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_multi_turn(llm):
    """Tests multi-turn conversation: 2 rounds of moderator prompt → LLM reply.

    Two questions are posted to the room, each followed by one reply from the
    LLM participant. The test then checks the transcript length and that each
    reply contains the expected chemical symbol.
    """
    room = _ChatRoom(
        system_prompt="A two-round trivia game.",
        name="Trivia",
    )

    player = room.add_participant(
        llm,
        name="Player",
        system_prompt="You are a trivia contestant. Answer each question in one concise sentence.",
    )

    with room:
        # Round 1
        room.post("Round 1: What is the chemical symbol for gold?")
        r1 = player.talk()

        # Round 2
        room.post("Round 2: What is the chemical symbol for silver?")
        r2 = player.talk()

    # Transcript must contain all messages (2 posts + 2 replies = 4)
    kbench.assertions.assert_equal(
        4,
        len(room.messages),
        expectation="Room should have 4 messages (2 posts + 2 replies).",
    )

    # Content verification.
    # The symbols are matched as whole words: a bare "au"/"ag" pattern would
    # also match inside unrelated words such as "because" or "again" and let
    # a wrong answer pass.
    kbench.assertions.assert_contains_regex(
        r"(?i)\bau\b",
        r1,
        expectation="Answer should contain 'Au' for gold.",
    )
    kbench.assertions.assert_contains_regex(
        r"(?i)\bag\b",
        r2,
        expectation="Answer should contain 'Ag' for silver.",
    )
1231+
1232+
1233+
# %%
1234+
# --- Test Case: ChatRoom — Private Channel Isolation ---
1235+
# Verifies that private_channel() messages are only visible to members
1236+
# and invisible to non-members. This is the core information-asymmetry
1237+
# primitive used by Werewolf and similar social deduction benchmarks.
1238+
1239+
1240+
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_private_channel(llm):
    """Check that a private channel's messages do not reach non-members.

    Alice is the only member of a private channel and is asked there to state
    a codeword. Bob, who is not a member, is afterwards asked in the public
    room to summarize what he has heard. The codeword must appear in Alice's
    private reply and must not appear in Bob's summary.
    """
    room = _ChatRoom(
        system_prompt="A team coordination exercise with a secret planning phase.",
        name="Coordinator",
    )

    alice_prompt = (
        "You are Alice. In the secret channel, always mention the codeword 'BLUEPRINT'. "
        "In the public channel, never mention the codeword."
    )
    insider = room.add_participant(
        llm,
        name="Alice",
        avatar="👩",
        system_prompt=alice_prompt,
    )
    outsider = room.add_participant(
        llm,
        name="Bob",
        avatar="👨",
        system_prompt="You are Bob. You do not know any secret codewords. Report what you know.",
    )

    with room:
        # Public phase: both participants speak; their replies are not inspected.
        room.post("Public phase: everyone introduces themselves briefly.")
        insider.talk()
        outsider.talk()

        # Private phase: the channel is created with Alice as its only member.
        secret = room.private_channel([insider], name="Secret Planning")
        with secret:
            secret.post("Alice, share your secret plan and mention the codeword.")
            private_reply = insider.talk()

        # Public again: ask Bob to report what he has seen.
        room.post(
            "Bob, summarize everything you've heard so far. Mention any codewords if you heard any."
        )
        outsider_summary = outsider.talk()

    # The codeword must show up in Alice's private reply.
    kbench.assertions.assert_contains_regex(
        r"(?i)blueprint",
        private_reply,
        expectation="Alice's private message should contain the codeword 'BLUEPRINT'.",
    )

    # The codeword must be absent from Bob's summary (case-insensitive check).
    codeword_leaked = "blueprint" in outsider_summary.lower()
    kbench.assertions.assert_true(
        not codeword_leaked,
        f"Bob should NOT know the codeword, but his summary was: '{outsider_summary[:200]}'",
    )
1294+
1295+
1296+
# %%
1297+
# --- Test Case: ChatRoom — Actor (Non-LLM Participant) ---
1298+
# Verifies that Actor.talk() posts scripted messages visible to LLM
1299+
# participants, and that LLMs can respond to Actor messages correctly.
1300+
1301+
1302+
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_actor_talk(llm):
    """Tests that a non-LLM Actor can post messages that LLMs respond to.

    A scripted ``Actor`` posts a number via ``talk()`` and the LLM participant
    is instructed to reply with that number plus one. The test checks the
    reply and that the first transcript entry is attributed to the Actor's
    name.
    """
    game = _actors.Actor(name="GameEngine", role="user", avatar="🎮")

    room = _ChatRoom(
        system_prompt="A simple number guessing game. The GameEngine posts a number, the Player guesses.",
        name="GameEngine",
    )

    room.add_participant(game)
    player = room.add_participant(
        llm,
        name="Player",
        system_prompt=(
            "You are a player in a number game. When told a number, "
            "respond with that number plus one. Reply with ONLY the number."
        ),
    )

    with room:
        game.talk("The number is: 41")
        reply = player.talk()

    # Require "42" as a standalone number: the digit lookarounds reject
    # replies such as "142" or "420" that merely contain the digits.
    kbench.assertions.assert_contains_regex(
        r"(?<!\d)42(?!\d)",
        reply,
        expectation="Player should respond with 42 (41 + 1).",
    )

    # Actor message is in transcript. assert_equal is used, as in the other
    # ChatRoom tests in this file, instead of assert_true on an == expression.
    kbench.assertions.assert_equal(
        "GameEngine",
        room.messages[0].sender.name,
        expectation="First message should be from the GameEngine Actor.",
    )

src/kaggle_benchmarks/chats.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,14 @@ def _build_roster(self, viewer: "actors.Actor") -> str:
398398
)
399399
lines.append("Your messages appear without a prefix.")
400400
lines.append("")
401+
lines.append(
402+
"Messages tagged with '(private: Channel Name)' are from a"
403+
" private channel visible only to its members. Other participants"
404+
" cannot see these messages. When you see a private channel"
405+
" prompt, respond to it directly — do not continue the public"
406+
" conversation."
407+
)
408+
lines.append("")
401409
lines.append(
402410
f'Note on "{self.name}": Messages from "{self.name}" are'
403411
" system/narrator instructions, not from another player."

0 commit comments

Comments
 (0)