Skip to content

Commit 2e84931

Browse files
committed
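minor: use a separate judge LLM for PlayerO in the tic-tac-toe example, update the Werewolf docs for 7 players, tidy test imports, and expand the ChatRoom.post() docstring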
1 parent cd1378e commit 2e84931

4 files changed

Lines changed: 45 additions & 27 deletions

File tree

documentation/examples/chatroom_tic_tac_toe.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def _check_game_state(self):
115115
)
116116
def run_tic_tac_toe(
117117
llm: kbench.LLMChat,
118+
judge_llm: kbench.LLMChat,
118119
) -> dict:
119120
"""Runs Tic-Tac-Toe using ChatRoom.
120121
@@ -126,7 +127,7 @@ def run_tic_tac_toe(
126127
After ChatRoom:
127128
- Game engine is an Actor that posts board state
128129
- Players see the full history (own moves as "assistant", peer as "user")
129-
- talk(schema=TicTacToeMove) returns structured output directly
130+
- reply(schema=TicTacToeMove) returns structured output directly
130131
"""
131132
game = TicTacToe()
132133
game_engine = actors.Actor(name="Game", role="user", avatar="🎮")
@@ -148,7 +149,7 @@ def run_tic_tac_toe(
148149
system_prompt="You are player X in Tic-Tac-Toe. When it's your turn, respond with your move (row and col, 0-indexed).",
149150
)
150151
player_o = room.add_participant(
151-
llm,
152+
judge_llm,
152153
name="PlayerO",
153154
avatar="⭕",
154155
system_prompt="You are player O in Tic-Tac-Toe. When it's your turn, respond with your move (row and col, 0-indexed).",
@@ -175,8 +176,6 @@ def run_tic_tac_toe(
175176

176177
# %%
177178

178-
model = kbench.llm
179-
run_tic_tac_toe.run(llm=model)
180-
179+
run_tic_tac_toe.run(llm=kbench.llm, judge_llm=kbench.judge_llm)
181180

182181
# %%

documentation/examples/chatroom_werewolf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,15 @@
1414

1515
"""Werewolf: A Benchmark for Social Deduction, Private Channels, and Deception
1616
17-
This example showcases a complete, 4-player game of Werewolf (also known as Mafia) using ChatRoom.
17+
This example showcases a complete, 7-player game of Werewolf (also known as Mafia) using ChatRoom.
1818
Werewolf is the gold standard of multi-agent social evaluation because it requires:
1919
1. Information Asymmetry: Roles are secret (Werewolves know their team; Villagers do not).
2020
2. Private Channels: Werewolves converse secretly at night to target a victim.
2121
3. Deception vs. Deduction: Werewolves must lie and blend in; Villagers must audit arguments.
2222
2323
We represent:
2424
- Alice and Bob as Werewolves (Secret Werewolf Team)
25-
- Charlie and David as Villagers (Secret Villager Team)
25+
- Charlie, David, Eve, Frank, and Grace as Villagers (Secret Villager Team)
2626
"""
2727

2828
# %%
@@ -144,7 +144,7 @@ def count_votes(vote_dict: dict) -> str | None:
144144
wolf_chat = room.private_channel(active_wolves, name="Werewolf Night Chat")
145145
victim = None
146146

147-
# Enter the private werewolf channel. Charlie and David are blind to this context.
147+
# Enter the private werewolf channel. The villagers are blind to this context.
148148
with wolf_chat:
149149
wolf_chat.post(
150150
"Werewolves, discuss and pick a Villager to eliminate tonight."

golden_tests/test_cookbook_examples.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434

3535
# %%
3636
import base64
37-
import dataclasses as _dc
37+
import dataclasses
3838
import os
3939
import tempfile
4040
from contextlib import contextmanager
@@ -47,9 +47,13 @@
4747
from pydantic import BaseModel, Field
4848

4949
import kaggle_benchmarks as kbench
50-
from kaggle_benchmarks import actors as _actors
51-
from kaggle_benchmarks.chats import ChatRoom as _ChatRoom
52-
from kaggle_benchmarks.content_types import audios, images, videos
50+
from kaggle_benchmarks.actors import Actor
51+
from kaggle_benchmarks.chats import ChatRoom
52+
from kaggle_benchmarks.content_types import (
53+
audios,
54+
images,
55+
videos,
56+
)
5357

5458
# Models to be tested as the primary subject.
5559
TEST_LLM_NAMES = {
@@ -308,13 +312,13 @@ def test_extract_pydantic(llm):
308312
# --- Test Case: Structured Output (composite pydantic Extraction) ---
309313

310314

311-
class Actor(BaseModel):
315+
class FriendsActor(BaseModel):
312316
actor_name: str
313317
role_name: str
314318

315319

316320
class Casting(BaseModel):
317-
actors: list[Actor]
321+
actors: list[FriendsActor]
318322

319323

320324
# Known failures (genai): gpt-5.5 — MP sends empty json_schema.name.
@@ -1070,7 +1074,7 @@ def test_tool_with_schema_output(llm):
10701074
@kbench.task()
10711075
def test_chatroom_add_participant(llm):
10721076
"""Tests that the same LLM added twice yields independent participants."""
1073-
room = _ChatRoom(
1077+
room = ChatRoom(
10741078
system_prompt="A quick Q&A between two experts.",
10751079
name="Host",
10761080
)
@@ -1132,7 +1136,7 @@ def test_chatroom_add_participant(llm):
11321136
# within a ChatRoom context, combining multi-participant rooms with schema.
11331137

11341138

1135-
@_dc.dataclass(frozen=True)
1139+
@dataclasses.dataclass(frozen=True)
11361140
class _CityFact:
11371141
"""A structured fact about a city."""
11381142

@@ -1145,12 +1149,12 @@ class _CityFact:
11451149
@kbench.task()
11461150
def test_chatroom_talk_structured_output(llm):
11471151
"""Tests that reply(schema=) works inside a ChatRoom."""
1148-
room = _ChatRoom(
1152+
room = ChatRoom(
11491153
system_prompt="A geography quiz game.",
11501154
name="QuizMaster",
11511155
)
11521156

1153-
host = _actors.Actor(name="QuizMaster", role="user", avatar="🎯")
1157+
host = Actor(name="QuizMaster", role="user", avatar="🎯")
11541158
room.add_participant(host)
11551159
player = room.add_participant(
11561160
llm,
@@ -1190,7 +1194,7 @@ def test_chatroom_talk_structured_output(llm):
11901194
@kbench.task()
11911195
def test_chatroom_multi_turn(llm):
11921196
"""Tests multi-turn conversation: 2 rounds of moderator prompt → LLM reply."""
1193-
room = _ChatRoom(
1197+
room = ChatRoom(
11941198
system_prompt="A two-round trivia game.",
11951199
name="Trivia",
11961200
)
@@ -1241,7 +1245,7 @@ def test_chatroom_multi_turn(llm):
12411245
@kbench.task()
12421246
def test_chatroom_private_channel(llm):
12431247
"""Tests that private_channel messages are invisible to non-members."""
1244-
room = _ChatRoom(
1248+
room = ChatRoom(
12451249
system_prompt="A team coordination exercise with a secret planning phase.",
12461250
name="Coordinator",
12471251
)
@@ -1303,9 +1307,9 @@ def test_chatroom_private_channel(llm):
13031307
@kbench.task()
13041308
def test_chatroom_actor_talk(llm):
13051309
"""Tests that a non-LLM Actor can post messages that LLMs respond to."""
1306-
game = _actors.Actor(name="GameEngine", role="user", avatar="🎮")
1310+
game = Actor(name="GameEngine", role="user", avatar="🎮")
13071311

1308-
room = _ChatRoom(
1312+
room = ChatRoom(
13091313
system_prompt="A simple number guessing game. The GameEngine posts a number, the Player guesses.",
13101314
name="GameEngine",
13111315
)

src/kaggle_benchmarks/chats.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -363,15 +363,30 @@ def __exit__(self, *exc):
363363
def post(
364364
self, message: str, visible_to: list["actors.Actor"] | None = None
365365
) -> Message:
366-
"""Post a narrator message to the room.
366+
"""Post a system/narrator directive to the room.
367367
368-
Uses the room's ``_narrator`` actor (whose name matches the room name)
369-
so the LLM sees consistent sender identity.
368+
Messages posted via ``post()`` come from the room's internal narrator
369+
— an actor that is **not** a registered participant. The roster
370+
explicitly tells LLMs to treat these messages as system instructions
371+
rather than peer speech, reducing the chance of LLMs "arguing back"
372+
against directives.
373+
374+
**When to use ``post()`` vs ``actor.say()``:**
375+
376+
- Use ``post()`` for structural directives that no specific character
377+
owns: phase transitions, rules, topic prompts.
378+
Example: ``room.post("--- Phase 2: Rebuttals ---")``
379+
380+
- Use ``actor.say()`` when a **named participant** is speaking and
381+
you want the message attributed to that identity in the transcript.
382+
Example: ``game_engine.say(f"Board:\\n{board}")``
370383
371384
Args:
372385
message: The message text.
373-
visible_to: Optional list of Actors who can see this message.
374-
If None, all participants can see it.
386+
visible_to: Optional list of participants who can see this message.
387+
If None (default), all participants can see it. Useful for
388+
sending private instructions to specific participants without
389+
creating a full ``private_channel()``.
375390
"""
376391
msg = Message(sender=self._narrator, content=message)
377392
if visible_to is not None:

0 commit comments

Comments
 (0)