Skip to content

Commit e1d4021

Browse files
committed
fix: improve chat accuracy and keyword fallback
- Add date/portal extraction before keyword classification in the fallback
- Partial portal name matching (e.g. 'Löwenbräu' → 'Löwenbräu-Festzelt')
- Include actual dates in _compress_availability and MCP tool results
- Add check_date param for on-demand deep-scanning in monitor_availability
- Rewrite SCANNER_INSTRUCTIONS with accuracy and action rules
- Lower LLM temperature to 0.2 to reduce hallucination
- Show accumulated thinking steps in chat UI
- Add eval_chat_accuracy.py test suite (6/6 passing)
1 parent 13f90e6 commit e1d4021

5 files changed

Lines changed: 519 additions & 33 deletions

File tree

src/wiesn_agent/api.py

Lines changed: 145 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@
5151
# Stores both user and agent messages for the chat panel.
5252
_chat_log: deque[dict] = deque(maxlen=200)
5353

54+
# ── Thinking status (broadcast to SSE clients) ───
55+
_thinking_status: str = ""
56+
5457

5558
# ── Persistence helpers ───────────────────────────
5659

@@ -464,6 +467,70 @@ class ChatMessage(BaseModel):
464467
}
465468

466469

470+
_MONTH_NAMES = {
471+
"januar": "01", "februar": "02", "märz": "03", "april": "04",
472+
"mai": "05", "juni": "06", "juli": "07", "august": "08",
473+
"september": "09", "oktober": "10", "november": "11", "dezember": "12",
474+
}
475+
476+
477+
def _extract_date(text: str) -> str:
478+
"""Extract a date from user text. Returns YYYY-MM-DD or DD.MM.YYYY or empty string."""
479+
# Match DD.MM or DD.MM.YYYY
480+
m = re.search(r'(\d{1,2})\.(\d{1,2})(?:\.(\d{4}))?', text)
481+
if m:
482+
day = int(m.group(1))
483+
month = int(m.group(2))
484+
year = int(m.group(3)) if m.group(3) else 2026
485+
return f"{year}-{month:02d}-{day:02d}"
486+
487+
# Match "DD. Monat" or "DD Monat"
488+
m = re.search(r'(\d{1,2})\.?\s+(januar|februar|märz|april|mai|juni|juli|august|september|oktober|november|dezember)', text.lower())
489+
if m:
490+
day = int(m.group(1))
491+
month = int(_MONTH_NAMES[m.group(2)])
492+
return f"2026-{month:02d}-{day:02d}"
493+
494+
return ""
495+
496+
497+
def _date_matches(iso_date: str, text: str) -> bool:
498+
"""Check if an ISO date (YYYY-MM-DD) matches a date in text (various formats)."""
499+
try:
500+
from datetime import datetime as dt
501+
parsed = dt.strptime(iso_date, "%Y-%m-%d")
502+
# DD.MM.YYYY format
503+
if parsed.strftime("%d.%m.%Y") in text:
504+
return True
505+
# German month name
506+
months_de = ["Januar", "Februar", "März", "April", "Mai", "Juni",
507+
"Juli", "August", "September", "Oktober", "November", "Dezember"]
508+
german = f"{parsed.day}. {months_de[parsed.month - 1]}"
509+
if german in text:
510+
return True
511+
except (ValueError, IndexError):
512+
pass
513+
return False
514+
515+
516+
def _find_portal(text: str, config) -> str | None:
517+
"""Find a portal name mentioned in text, with partial matching."""
518+
lower = text.lower()
519+
# Exact match first
520+
for portal in config.portale:
521+
if portal.name.lower() in lower:
522+
return portal.name
523+
# Partial match: split portal name on spaces/hyphens and check core parts
524+
for portal in config.portale:
525+
parts = re.split(r'[\s\-]+', portal.name.lower())
526+
# Match if any distinctive part (>3 chars, not generic) appears
527+
generic = {"fest", "zelt", "festzelt", "wiesn"}
528+
for part in parts:
529+
if len(part) > 3 and part not in generic and part in lower:
530+
return portal.name
531+
return None
532+
533+
467534
def _classify_intent(text: str) -> str:
468535
"""Classify user message into an intent. Returns intent name or 'unknown'."""
469536
lower = text.lower().strip()
@@ -534,14 +601,11 @@ async def post_chat(body: ChatMessage):
534601
}
535602

536603
def _on_tool_progress(tool_name: str, tool_args: dict) -> None:
604+
global _thinking_status
537605
portal = tool_args.get("portal_name") or tool_args.get("name") or ""
538606
label = _TOOL_LABELS.get(tool_name, tool_name)
539607
detail = f" — {portal}" if portal else ""
540-
_chat_log.append({
541-
"timestamp": datetime.now().isoformat(),
542-
"role": "thinking",
543-
"message": f"{label}{detail}",
544-
})
608+
_thinking_status = f"{label}{detail}"
545609

546610
history = list(_chat_log)[:-1] # exclude current message (already in prompt)
547611
reply_text = await llm_chat(
@@ -550,9 +614,8 @@ def _on_tool_progress(tool_name: str, tool_args: dict) -> None:
550614
on_progress=_on_tool_progress,
551615
)
552616

553-
# Remove thinking entries before adding final reply
554-
while _chat_log and _chat_log[-1].get("role") == "thinking":
555-
_chat_log.pop()
617+
# Clear thinking status
618+
_thinking_status = ""
556619

557620
reply = _chat_reply(reply_text)
558621
return {"user": user_entry, "reply": reply}
@@ -563,18 +626,78 @@ def _on_tool_progress(tool_name: str, tool_args: dict) -> None:
563626
logger.warning("LLM chat error, falling back to keywords: %s", e, exc_info=True)
564627

565628
# ── Keyword fallback ──────────────────────────
629+
# First, check for date/portal mentions (more specific than keyword intents)
630+
snapshots = load_snapshots()
631+
config = _load_config()
632+
mentioned_date = _extract_date(text)
633+
mentioned_portal = _find_portal(text, config)
634+
635+
if mentioned_portal and mentioned_date:
636+
snap = snapshots.get(mentioned_portal)
637+
has_date = False
638+
if snap:
639+
for d in snap.datum_options:
640+
val = d.get("value", d.get("text", ""))
641+
txt = d.get("text", d.get("value", ""))
642+
if mentioned_date in val or mentioned_date in txt or _date_matches(mentioned_date, txt):
643+
has_date = True
644+
break
645+
if has_date:
646+
reply = _chat_reply(
647+
f"**{mentioned_portal}** hat den **{mentioned_date}** als auswählbares Datum. "
648+
f"Abend-Slots sind nicht bestätigt (dafür ist ein Deep-Scan nötig)."
649+
)
650+
else:
651+
reply = _chat_reply(f"**{mentioned_portal}** hat den **{mentioned_date}** leider **nicht** verfügbar.")
652+
return {"user": user_entry, "reply": reply}
653+
654+
if mentioned_date:
655+
with_date = []
656+
without_date = []
657+
for name, snap in snapshots.items():
658+
found = False
659+
for d in snap.datum_options:
660+
val = d.get("value", d.get("text", ""))
661+
txt = d.get("text", d.get("value", ""))
662+
if mentioned_date in val or mentioned_date in txt or _date_matches(mentioned_date, txt):
663+
found = True
664+
break
665+
if found:
666+
with_date.append(name)
667+
else:
668+
without_date.append(name)
669+
670+
if with_date:
671+
reply = _chat_reply(
672+
f"**{len(with_date)}** Zelte haben den **{mentioned_date}** als auswählbares Datum: "
673+
f"{', '.join(with_date)}.\n\n"
674+
f"**{len(without_date)}** Zelte haben diesen Tag nicht."
675+
)
676+
else:
677+
reply = _chat_reply(f"Kein Zelt hat den **{mentioned_date}** verfügbar.")
678+
return {"user": user_entry, "reply": reply}
679+
680+
if mentioned_portal:
681+
snap = snapshots.get(mentioned_portal)
682+
if snap and snap.datum_options:
683+
dates = [d.get("text", d.get("value", "")) for d in snap.datum_options]
684+
reply = _chat_reply(
685+
f"**{mentioned_portal}** hat **{len(dates)}** auswählbare Termine:\n"
686+
+ ", ".join(dates)
687+
)
688+
elif snap:
689+
reply = _chat_reply(f"**{mentioned_portal}** hat aktuell **keine** verfügbaren Termine.")
690+
else:
691+
reply = _chat_reply(f"**{mentioned_portal}** wurde noch nicht gescannt.")
692+
return {"user": user_entry, "reply": reply}
693+
694+
# Fall back to keyword classification for generic intents
566695
intent = _classify_intent(text)
567696

568697
# ── Intent: Scan ──────────────────────────────
569698
if intent == "scan":
570699
config = _load_config()
571-
# Check if a specific portal is mentioned
572-
target_portal = None
573-
lower = text.lower()
574-
for portal in config.portale:
575-
if portal.name.lower() in lower:
576-
target_portal = portal.name
577-
break
700+
target_portal = _find_portal(text, config)
578701

579702
if target_portal:
580703
reply = _chat_reply(f"Starting scan for **{target_portal}**...")
@@ -652,7 +775,7 @@ def _on_tool_progress(tool_name: str, tool_args: dict) -> None:
652775
reply = _chat_reply("Portals:\n" + "\n".join(lines))
653776
return {"user": user_entry, "reply": reply}
654777

655-
# ── Default: unrecognized → show help ─────────
778+
# ── Truly unrecognized → show help ──
656779
reply = _chat_reply(
657780
"I didn't quite catch that. Here's what I can help with:\n"
658781
"- **scan** — Start scanning portals\n"
@@ -692,10 +815,15 @@ async def event_generator():
692815
# Use a snapshot of the current deque to avoid index race conditions
693816
# when items are evicted from the maxlen ring buffer.
694817
last_seen = len(list(_chat_log))
818+
prev_thinking = ""
695819
yield f"data: {json.dumps({'type': 'connected', 'count': last_seen})}\n\n"
696820
while True:
697821
if await request.is_disconnected():
698822
break
823+
# Broadcast thinking status changes immediately
824+
if _thinking_status != prev_thinking:
825+
prev_thinking = _thinking_status
826+
yield f"data: {json.dumps({'role': 'thinking', 'message': _thinking_status})}\n\n"
699827
snapshot = list(_chat_log)
700828
if len(snapshot) > last_seen:
701829
for item in snapshot[last_seen:]:
@@ -704,7 +832,7 @@ async def event_generator():
704832
elif len(snapshot) < last_seen:
705833
# Buffer wrapped — reset
706834
last_seen = len(snapshot)
707-
await asyncio.sleep(0.5)
835+
await asyncio.sleep(0.3)
708836

709837
return StreamingResponse(event_generator(), media_type="text/event-stream")
710838

src/wiesn_agent/chat_agent.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,35 @@
7777
{config}
7878
7979
## Your Tools
80-
- `monitor_availability(portal_name)` — scan date dropdowns, compare with saved \
80+
- `monitor_availability(portal_name, check_date)` — scan date dropdowns, compare with saved \
8181
snapshots, deep-scan time slots.
8282
- Use `portal_name="all"` for ALL portals (preferred!).
8383
- Use a specific name (e.g. `"Hacker-Festzelt"`) for one portal.
8484
- **NEVER** call separately for each portal — use "all" instead!
85+
- Use `check_date="2026-09-25"` to deep-scan time slots for a specific date.\
86+
**When the user asks about a specific date, ALWAYS use check_date!**
8587
- `check_portal(name)` — navigate to one portal and get page info.
8688
- `check_all_portals()` — quick check of all portals.
8789
90+
## Accuracy Rules (CRITICAL — HIGHEST PRIORITY)
91+
1. **NEVER guess or assume availability.** Only state what the tool result \
92+
explicitly contains.
93+
2. The tool result lists EXACT dates per portal. If the user asks about a \
94+
specific date (e.g. 25.9), look for exactly that date in the tool output. \
95+
If it's NOT listed → that portal does NOT have it. Say so.
96+
3. **Evening/abends slots** are only confirmed when the tool result explicitly \
97+
lists `abend_slots` for that date. A date in `datum_options` does NOT mean \
98+
evening slots exist — it only means the date dropdown contains it.
99+
4. If unsure, say "not confirmed" — never say "available" without proof.
100+
5. **Accuracy is the core product value. Wrong data = broken trust.**
101+
102+
## Action Rules
103+
1. When the user asks about availability, **immediately call the tool** — \
104+
do NOT ask "shall I check?" or "want me to look?".
105+
2. Always answer with specific data. **NEVER end your reply with a question** \
106+
like "Soll ich prüfen?" or "Möchtest du...?". Just give the answer.
107+
3. Only ask before WRITING/SUBMITTING — never before reading/checking.
108+
88109
## Background Monitoring
89110
A background scanner runs automatically every few minutes. \
90111
When the user asks "Status" or "Übersicht", you can answer from your knowledge — \
@@ -244,7 +265,11 @@ def _classify(self, text: str) -> str:
244265
# ── Availability compression ─────────────────────
245266

246267
def _compress_availability(raw: str) -> str:
247-
"""Compress monitor_availability JSON into a pre-formatted summary."""
268+
"""Compress monitor_availability JSON into a pre-formatted summary.
269+
270+
Includes actual date values so the LLM can accurately answer
271+
date-specific queries without guessing.
272+
"""
248273
import json
249274

250275
try:
@@ -260,16 +285,21 @@ def _compress_availability(raw: str) -> str:
260285
without_dates: list[str] = []
261286
with_new: list[str] = []
262287
errors: list[str] = []
288+
# Collect per-portal date details for accuracy
289+
date_details: list[str] = []
263290

264291
for r in results:
265292
name = r.get("portal", "?")
266293
count = r.get("datum_count", 0)
267294
new = r.get("new_dates", [])
295+
dates = r.get("dates", []) # actual date values
268296

269297
if r.get("error"):
270298
errors.append(name)
271299
elif count > 0:
272300
with_dates.append(f"{name} ({count})")
301+
if dates:
302+
date_details.append(f"{name}: {', '.join(dates)}")
273303
if new and not r.get("is_first_scan"):
274304
with_new.append(f"{name}: +{len(new)} new")
275305
else:
@@ -279,6 +309,10 @@ def _compress_availability(raw: str) -> str:
279309

280310
lines = [
281311
"RELAY THIS SUMMARY TO THE USER (translate to their language, keep it compact):",
312+
"ACCURACY RULE: A date in 'Available dates per portal' means the date is SELECTABLE in the dropdown.",
313+
"It does NOT mean evening/abends slots are available — evening confirmation requires 'deep_scan'.",
314+
"If the user asks about evening: only confirm if 'Evening:' lines exist below for that date.",
315+
"If no 'Evening:' line exists → say 'date is selectable but evening slots are not yet confirmed'.",
282316
"",
283317
f"{len(with_dates)} of {total} tents have open dates: {', '.join(with_dates)}.",
284318
]
@@ -291,13 +325,33 @@ def _compress_availability(raw: str) -> str:
291325
if errors:
292326
lines.append(f"Errors: {', '.join(errors)}.")
293327

328+
# Include exact dates per portal so LLM can answer date-specific queries
329+
if date_details:
330+
lines.append("")
331+
lines.append("Available dates per portal (selectable in dropdown, NOT confirmed evening):")
332+
lines.extend(date_details)
333+
334+
has_evening = False
335+
deep_scanned_no_evening: list[str] = []
294336
for r in results:
295337
ds = r.get("deep_scan")
296338
if ds:
297339
for d in ds:
298340
slots = d.get("abend_slots", [])
299341
if slots:
300-
lines.append(f"Evening: {r['portal']}{d['datum']}: {', '.join(slots)}")
342+
lines.append(f"CONFIRMED Evening: {r['portal']}{d['datum']}: {', '.join(slots)}")
343+
has_evening = True
344+
else:
345+
deep_scanned_no_evening.append(r.get("portal", "?"))
346+
347+
if deep_scanned_no_evening:
348+
lines.append("")
349+
lines.append(f"Deep-scanned but NO evening slots: {', '.join(deep_scanned_no_evening)}.")
350+
lines.append("These tents have the date selectable but NO evening time slots are available.")
351+
352+
if not has_evening and not deep_scanned_no_evening:
353+
lines.append("")
354+
lines.append("No evening slots confirmed in this scan. Dates above are only selectable in the dropdown — time slot availability is unknown until deep-scanned.")
301355

302356
return "\n".join(lines)
303357

@@ -425,7 +479,7 @@ def _build_workflow(self) -> WorkflowAgent:
425479
)
426480

427481
config_ctx = self._build_config_context()
428-
agent_opts = OpenAIChatCompletionOptions(temperature=0.4, max_tokens=2048)
482+
agent_opts = OpenAIChatCompletionOptions(temperature=0.2, max_tokens=2048)
429483

430484
# Shared middleware: progress reporting + result compression.
431485
# Captures self._on_progress which is updated before each chat() call.

0 commit comments

Comments
 (0)