|
34 | 34 |
|
35 | 35 | # %% |
36 | 36 | import base64 |
| 37 | +import dataclasses as _dc |
37 | 38 | import os |
38 | 39 | import tempfile |
39 | 40 | from contextlib import contextmanager |
|
46 | 47 | from pydantic import BaseModel, Field |
47 | 48 |
|
48 | 49 | import kaggle_benchmarks as kbench |
| 50 | +from kaggle_benchmarks import actors as _actors |
| 51 | +from kaggle_benchmarks.chats import ChatRoom as _ChatRoom |
49 | 52 | from kaggle_benchmarks.content_types import audios, images, videos |
50 | 53 |
|
51 | 54 | # Models to be tested as the primary subject. |
@@ -1047,3 +1050,288 @@ def test_tool_with_schema_output(llm): |
1047 | 1050 | result.population, |
1048 | 1051 | expectation="Population should be 3,700,000.", |
1049 | 1052 | ) |
| 1053 | + |
| 1054 | + |
| 1055 | +# %% |
| 1056 | +# --- Test Case: ChatRoom — add_participant + LLM Cloning --- |
| 1057 | +# Verifies that the same LLM object can be added as multiple participants via |
| 1058 | +# add_participant(), each receiving a distinct identity (name, system_prompt) |
| 1059 | +# and producing correct responses without role collision. |
| 1060 | + |
| 1061 | + |
# Model identifiers passed as ``include=`` to ``benchmark_test`` by every
# ChatRoom test case in this section.
CHATROOM_LLM_NAMES = {
    "anthropic/claude-sonnet-4-6@default",
    "google/gemini-3-flash-preview",
    "google/gemini-2.5-flash",
}
| 1067 | + |
| 1068 | + |
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_add_participant(llm):
    """Checks that one LLM registered twice acts as two separate participants.

    The same ``llm`` object is added to a room as "Alice" and as "Bob", each
    with its own system prompt. After one posted question and one reply from
    each, the test asserts that the two returned participants are different
    objects, that each reply mentions the language named in that
    participant's prompt, and that the second and third transcript messages
    are attributed to Alice and Bob respectively.
    """
    chat = _ChatRoom(
        name="Host",
        system_prompt="A quick Q&A between two experts.",
    )

    python_expert = chat.add_participant(
        llm,
        name="Alice",
        avatar="👩",
        system_prompt="You are Alice, a Python expert. Always mention Python in your replies.",
    )
    rust_expert = chat.add_participant(
        llm,
        name="Bob",
        avatar="👨",
        system_prompt="You are Bob, a Rust expert. Always mention Rust in your replies.",
    )

    question = "Each expert, name your favorite programming language in one sentence."
    with chat:
        chat.post(question)
        python_answer = python_expert.talk()
        rust_answer = rust_expert.talk()

    check = kbench.assertions

    # The two handles returned for the same LLM must not be one shared object.
    check.assert_true(
        python_expert is not rust_expert,
        "add_participant must return distinct objects for the same LLM.",
    )

    # Each reply should reflect the system prompt given to that participant.
    check.assert_contains_regex(
        r"(?i)python",
        python_answer,
        expectation="Alice (Python expert) should mention Python.",
    )
    check.assert_contains_regex(
        r"(?i)rust",
        rust_answer,
        expectation="Bob (Rust expert) should mention Rust.",
    )

    # Index 0 is the posted question; the replies follow in speaking order.
    expected_senders = (
        (1, "Alice", "Second message sender should be Alice."),
        (2, "Bob", "Third message sender should be Bob."),
    )
    for position, sender_name, note in expected_senders:
        check.assert_equal(
            sender_name,
            chat.messages[position].sender.name,
            expectation=note,
        )
| 1127 | + |
| 1128 | + |
| 1129 | +# %% |
| 1130 | +# --- Test Case: ChatRoom — Structured Output via talk(schema=) --- |
| 1131 | +# Verifies that talk(schema=) returns structured output (dataclass) from |
| 1132 | +# within a ChatRoom context, combining multi-participant rooms with schema. |
| 1133 | + |
| 1134 | + |
@_dc.dataclass(frozen=True)
class _CityFact:
    """Immutable record passed as ``schema=`` to request a structured city answer."""

    # Name of the city.
    city: str
    # Country the city is in.
    country: str
    # Population expressed in millions (the test prompt asks for an approximation).
    population_millions: float
| 1142 | + |
| 1143 | + |
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_talk_structured_output(llm):
    """Checks that ``talk(schema=...)`` yields a structured answer inside a ChatRoom.

    A scripted "QuizMaster" actor asks for the capital of France; the LLM
    participant answers with ``schema=_CityFact``. The returned object's
    ``city`` and ``country`` are matched case-insensitively and its
    ``population_millions`` must fall strictly between 0.5 and 15.0.
    """
    quiz = _ChatRoom(
        name="QuizMaster",
        system_prompt="A geography quiz game.",
    )

    quizmaster = _actors.Actor(name="QuizMaster", role="user", avatar="🎯")
    quiz.add_participant(quizmaster)
    contestant = quiz.add_participant(
        llm,
        name="Player",
        system_prompt="You are a geography expert. Answer questions accurately.",
    )

    question = "What is the capital of France? Provide city, country, and approximate population in millions."
    with quiz:
        quizmaster.talk(question)
        answer = contestant.talk(schema=_CityFact)

    check = kbench.assertions
    check.assert_contains_regex(
        r"(?i)paris",
        answer.city,
        expectation="City should be Paris.",
    )
    check.assert_contains_regex(
        r"(?i)france",
        answer.country,
        expectation="Country should be France.",
    )

    # Loose sanity window rather than an exact figure.
    population_is_plausible = 0.5 < answer.population_millions < 15.0
    check.assert_true(
        population_is_plausible,
        f"Population should be reasonable, got {answer.population_millions}M.",
    )
| 1181 | + |
| 1182 | + |
| 1183 | +# %% |
| 1184 | +# --- Test Case: ChatRoom — Multi-Turn Conversation --- |
| 1185 | +# Verifies that room.post() and talk() produce correct multi-turn histories |
| 1186 | +# and that room.messages captures the full transcript after exit. |
| 1187 | + |
| 1188 | + |
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_multi_turn(llm):
    """Tests a two-round conversation: moderator prompt, then LLM reply, twice.

    After the room exits, the transcript must hold exactly four messages
    (two posts and two replies), and each reply must contain the expected
    chemical symbol.

    The symbols are matched as whole words (case-insensitively). A bare
    substring search for "au" or "ag" would also succeed on ordinary words
    such as "because" or "language", letting a wrong answer pass.
    """
    room = _ChatRoom(
        system_prompt="A two-round trivia game.",
        name="Trivia",
    )

    player = room.add_participant(
        llm,
        name="Player",
        system_prompt="You are a trivia contestant. Answer each question in one concise sentence.",
    )

    with room:
        # Round 1
        room.post("Round 1: What is the chemical symbol for gold?")
        r1 = player.talk()

        # Round 2
        room.post("Round 2: What is the chemical symbol for silver?")
        r2 = player.talk()

    # Transcript must contain all messages (2 posts + 2 replies = 4)
    kbench.assertions.assert_equal(
        4,
        len(room.messages),
        expectation="Room should have 4 messages (2 posts + 2 replies).",
    )

    # Content verification: \b anchors keep the symbol from matching inside
    # a longer word.
    kbench.assertions.assert_contains_regex(
        r"(?i)\bau\b",
        r1,
        expectation="Answer should contain 'Au' for gold.",
    )
    kbench.assertions.assert_contains_regex(
        r"(?i)\bag\b",
        r2,
        expectation="Answer should contain 'Ag' for silver.",
    )
| 1231 | + |
| 1232 | + |
| 1233 | +# %% |
| 1234 | +# --- Test Case: ChatRoom — Private Channel Isolation --- |
| 1235 | +# Verifies that private_channel() messages are only visible to members |
| 1236 | +# and invisible to non-members. This is the core information-asymmetry |
| 1237 | +# primitive used by Werewolf and similar social deduction benchmarks. |
| 1238 | + |
| 1239 | + |
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_private_channel(llm):
    """Checks that a message posted in a private channel does not reach a non-member.

    Alice and Bob (the same ``llm`` under two identities) first speak in the
    public room. Alice then replies inside a private channel that contains
    only her, where she is asked to use a codeword. Finally Bob is asked in
    public to summarize what he heard. The test asserts that Alice's private
    reply contains the codeword and that Bob's summary does not.
    """
    lobby = _ChatRoom(
        name="Coordinator",
        system_prompt="A team coordination exercise with a secret planning phase.",
    )

    insider = lobby.add_participant(
        llm,
        name="Alice",
        avatar="👩",
        system_prompt=(
            "You are Alice. In the secret channel, always mention the codeword 'BLUEPRINT'. "
            "In the public channel, never mention the codeword."
        ),
    )
    outsider = lobby.add_participant(
        llm,
        name="Bob",
        avatar="👨",
        system_prompt="You are Bob. You do not know any secret codewords. Report what you know.",
    )

    with lobby:
        # Public phase: both participants speak where everyone can see.
        lobby.post("Public phase: everyone introduces themselves briefly.")
        insider.talk()
        outsider.talk()

        # Private phase: the channel's member list contains Alice only.
        hideout = lobby.private_channel([insider], name="Secret Planning")
        with hideout:
            hideout.post("Alice, share your secret plan and mention the codeword.")
            secret_reply = insider.talk()

        # Public again: Bob reports what he has seen.
        lobby.post(
            "Bob, summarize everything you've heard so far. Mention any codewords if you heard any."
        )
        bob_summary = outsider.talk()

    check = kbench.assertions

    # The codeword must show up in what Alice said privately.
    check.assert_contains_regex(
        r"(?i)blueprint",
        secret_reply,
        expectation="Alice's private message should contain the codeword 'BLUEPRINT'.",
    )

    # Bob was never a member of the private channel, so the codeword must be absent.
    codeword_leaked = "blueprint" in bob_summary.lower()
    check.assert_true(
        not codeword_leaked,
        f"Bob should NOT know the codeword, but his summary was: '{bob_summary[:200]}'",
    )
| 1294 | + |
| 1295 | + |
| 1296 | +# %% |
| 1297 | +# --- Test Case: ChatRoom — Actor (Non-LLM Participant) --- |
| 1298 | +# Verifies that Actor.talk() posts scripted messages visible to LLM |
| 1299 | +# participants, and that LLMs can respond to Actor messages correctly. |
| 1300 | + |
| 1301 | + |
@benchmark_test(include=CHATROOM_LLM_NAMES)
@kbench.task()
def test_chatroom_actor_talk(llm):
    """Tests that a non-LLM Actor can post messages that LLMs respond to.

    A scripted "GameEngine" actor posts a number and the LLM participant is
    instructed to answer with that number plus one. The test asserts that
    the reply contains 42 as a standalone number and that the first
    transcript message is attributed to the actor.
    """
    game = _actors.Actor(name="GameEngine", role="user", avatar="🎮")

    room = _ChatRoom(
        system_prompt="A simple number guessing game. The GameEngine posts a number, the Player guesses.",
        name="GameEngine",
    )

    room.add_participant(game)
    player = room.add_participant(
        llm,
        name="Player",
        system_prompt=(
            "You are a player in a number game. When told a number, "
            "respond with that number plus one. Reply with ONLY the number."
        ),
    )

    with room:
        game.talk("The number is: 41")
        reply = player.talk()

    # Word boundaries stop replies such as "142" or "420" from passing.
    kbench.assertions.assert_contains_regex(
        r"\b42\b",
        reply,
        expectation="Player should respond with 42 (41 + 1).",
    )

    # Actor message is in transcript; assert_equal matches the sender checks
    # used by the other ChatRoom tests.
    kbench.assertions.assert_equal(
        "GameEngine",
        room.messages[0].sender.name,
        expectation="First message should be from the GameEngine Actor.",
    )
0 commit comments