Skip to content

Commit 7f8300c

Browse files
authored
Merge pull request #1 from LSX-UniWue/refined-evaluation-prompts
Add refined evaluation prompts and LiteLLM caching
2 parents 36f4903 + b0bf9c5 commit 7f8300c

File tree

17 files changed

+1751
-725
lines changed

17 files changed

+1751
-725
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -8,7 +8,7 @@ dependencies = [
88
"pydantic>=2.12.5",
99
"uvicorn>=0.38.0",
1010
"python-dotenv",
11-
"litellm",
11+
"litellm[caching]",
1212
"langgraph>=0.2.0",
1313
"langchain-core>=0.3.0",
1414
"langchain-community>=0.3.0",

src/agent.py

Lines changed: 4 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -170,14 +170,17 @@ async def run(self, message: Message, updater: TaskUpdater) -> None:
170170
evaluator_max_tokens = evaluator_config.get("max_tokens", 2000)
171171
api_key = evaluator_config.get("api_key") or os.getenv("OPENAI_API_KEY")
172172
base_url = evaluator_config.get("base_url") or os.getenv("OPENAI_BASE_URL")
173+
prompt_style = evaluator_config.get("prompt_style") or os.getenv("PROMPT_STYLE", "default")
174+
print(f"Using prompt style: {prompt_style}")
173175

174176
step_evaluator = StepEvaluator(
175177
model=evaluator_model,
176178
max_tokens=evaluator_max_tokens,
177179
api_key=api_key,
178180
base_url=base_url,
179181
evaluation_protocol=evaluation_protocol,
180-
task_mode=task_mode
182+
task_mode=task_mode,
183+
prompt_style=prompt_style
181184
)
182185

183186
# Create workflow

src/evaluator/agent_interface.py

Lines changed: 6 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -10,6 +10,9 @@
1010

1111
from ..messenger import Messenger
1212

13+
# Import caching utilities
14+
from .cache import init_cache, is_cache_enabled
15+
1316

1417
class AgentInterface(Runnable):
1518
"""Abstract base class for agent under test.
@@ -86,6 +89,9 @@ def __init__(
8689
if self.base_url:
8790
os.environ["OPENAI_BASE_URL"] = self.base_url
8891

92+
# Initialize LLM caching (no-op if already initialized)
93+
init_cache(verbose=not is_cache_enabled())
94+
8995
# Set system prompt based on task mode (not protocol)
9096
if self.task_mode == "command":
9197
self.system_prompt = (
@@ -453,4 +459,3 @@ def create_agent_interface(config: Dict[str, Any]) -> AgentInterface:
453459
)
454460
else:
455461
raise ValueError(f"Invalid mode: {mode}. Must be 'internal' or 'a2a'")
456-

src/evaluator/cache.py

Lines changed: 209 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,209 @@
1+
"""LLM caching configuration for the evaluator.
2+
3+
This module provides centralized caching setup for all LLM calls in the evaluator.
4+
Uses LiteLLM's built-in disk caching to reduce API costs and speed up repeated runs.
5+
"""
6+
7+
import os
8+
import litellm
9+
from pathlib import Path
10+
from typing import Optional
11+
12+
# ANSI escape sequences used for colorized console status messages.
class Colors:
    """Minimal ANSI color palette for terminal output."""
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RESET = '\033[0m'  # restores the terminal's default color
19+
20+
21+
# Global flags to track cache state.
# _cache_initialized: set True by init_cache() once litellm.cache is configured.
# _cache_disabled: set True by disable_cache() or the LITELLM_CACHE_DISABLED
# env var; once True, init_cache() is a no-op for the rest of this process.
_cache_initialized = False
_cache_disabled = False
24+
25+
26+
def disable_cache(verbose: bool = True) -> None:
    """Globally disable LLM caching.

    Must be called BEFORE any init_cache() call; once the flag is set,
    every subsequent init_cache() becomes a no-op.

    Args:
        verbose: Whether to print status message
    """
    global _cache_disabled
    _cache_disabled = True
    if not verbose:
        return
    print(f"{Colors.YELLOW}⚠ LLM caching disabled globally{Colors.RESET}")
39+
40+
41+
def is_cache_disabled() -> bool:
    """Return True when caching has been globally vetoed (disable_cache() or env)."""
    return bool(_cache_disabled)
44+
45+
46+
def get_cache_dir() -> Path:
    """Resolve the on-disk cache location.

    Returns:
        Path to the cache directory; the LITELLM_CACHE_DIR environment
        variable overrides the default ``.litellm_cache`` in the project root.
    """
    return Path(os.getenv("LITELLM_CACHE_DIR", ".litellm_cache"))
54+
55+
56+
def init_cache(
    cache_type: str = "disk",
    cache_dir: Optional[str] = None,
    ttl: Optional[int] = None,
    verbose: bool = True
) -> bool:
    """Initialize LiteLLM caching.

    This should be called once at startup. Subsequent calls are no-ops.
    If caching has been globally disabled via disable_cache() or the
    LITELLM_CACHE_DISABLED environment variable is set to "true", this is a no-op.

    Args:
        cache_type: Type of cache ("disk", "redis", "s3", or "local" for in-memory)
        cache_dir: Directory for disk cache (default: .litellm_cache)
        ttl: Time-to-live for cache entries in seconds (default: None = forever)
        verbose: Whether to print cache initialization status

    Returns:
        True if cache was initialized, False if already initialized or disabled
    """
    global _cache_initialized, _cache_disabled

    # Check environment variable for cache disable; latch it into the global
    # flag so later calls short-circuit without re-reading the environment.
    env_disabled = os.getenv("LITELLM_CACHE_DISABLED", "").lower() in ("true", "1", "yes")
    if env_disabled and not _cache_disabled:
        _cache_disabled = True
        if verbose:
            print(f"{Colors.YELLOW}⚠ LLM caching disabled via LITELLM_CACHE_DISABLED{Colors.RESET}")

    # Respect global disable flag
    if _cache_disabled:
        return False

    if _cache_initialized:
        if verbose:
            print(f"{Colors.YELLOW}⚠ LLM cache already initialized{Colors.RESET}")
        return False

    # Set cache directory
    if cache_dir is None:
        cache_dir = str(get_cache_dir())

    # Ensure cache directory exists for disk cache
    if cache_type == "disk":
        Path(cache_dir).mkdir(parents=True, exist_ok=True)

    # Configure LiteLLM cache
    cache_params = {
        "type": cache_type,
    }

    if cache_type == "disk":
        cache_params["disk_cache_dir"] = cache_dir

    if ttl is not None:
        cache_params["ttl"] = ttl

    # Initialize the cache
    litellm.cache = litellm.Cache(**cache_params)

    # Enable caching globally
    litellm.enable_cache()

    _cache_initialized = True

    if verbose:
        print(f"{Colors.GREEN}✓ LLM caching enabled{Colors.RESET}")
        print(f" Type: {cache_type}")
        if cache_type == "disk":
            print(f" Directory: {cache_dir}")
        # Fix: use the same `is not None` guard as the cache_params block above,
        # so a configured ttl of 0 is reported instead of silently omitted.
        if ttl is not None:
            print(f" TTL: {ttl}s")

    return True
131+
132+
133+
def is_cache_enabled() -> bool:
    """Check if caching is currently enabled.

    Returns:
        True only when init_cache() completed in this process AND litellm
        still holds a cache object.
    """
    if not _cache_initialized:
        return False
    return litellm.cache is not None
140+
141+
142+
def get_cache_stats() -> dict:
    """Get cache statistics (if available).

    Returns:
        Dictionary with cache statistics including disk cache info.
        Keys: "initialized", "enabled", "cache_dir", "cache_files",
        "cache_size_mb", "has_cache" (plus "cache_size_bytes" when the
        cache directory exists).
    """
    stats = {
        "initialized": _cache_initialized,
        # Fix: consumers (e.g. the end-of-run summary in main.py) check
        # stats.get("enabled"); without this key that check was always falsy
        # and the summary never printed.
        "enabled": is_cache_enabled(),
    }

    # Always check disk cache stats (even if not initialized in this process)
    cache_dir = get_cache_dir()
    stats["cache_dir"] = str(cache_dir)
    if cache_dir.exists():
        cache_files = list(cache_dir.glob("*"))
        stats["cache_files"] = len(cache_files)
        stats["cache_size_bytes"] = sum(f.stat().st_size for f in cache_files if f.is_file())
        stats["cache_size_mb"] = round(stats["cache_size_bytes"] / (1024 * 1024), 2)
        stats["has_cache"] = stats["cache_files"] > 0
    else:
        stats["cache_files"] = 0
        stats["cache_size_mb"] = 0
        stats["has_cache"] = False

    return stats
168+
169+
170+
def clear_cache(verbose: bool = True) -> bool:
    """Clear the LLM cache.

    Args:
        verbose: Whether to print status

    Returns:
        True if cache was cleared successfully
    """
    import shutil

    target = get_cache_dir()

    if not target.exists():
        if verbose:
            print(f"{Colors.YELLOW}⚠ Cache directory does not exist{Colors.RESET}")
        return False

    # Count entries first so the status line can report what was removed.
    files_count = len(list(target.glob("*")))

    # Wipe the directory and recreate it empty in place.
    shutil.rmtree(target)
    target.mkdir(parents=True, exist_ok=True)

    if verbose:
        print(f"{Colors.GREEN}✓ Cache cleared ({files_count} files removed){Colors.RESET}")
    return True
198+
199+
200+
def print_cache_status():
    """Print current cache status to console."""
    info = get_cache_stats()

    # Assemble the report, then emit it line by line (trailing blank included).
    report = [
        f"\n{Colors.BLUE}=== LLM Cache Status ==={Colors.RESET}",
        f" Directory: {info.get('cache_dir', 'N/A')}",
        f" Has cached data: {info.get('has_cache', False)}",
        f" Files: {info.get('cache_files', 0)}",
        f" Size: {info.get('cache_size_mb', 0)} MB",
    ]
    for line in report:
        print(line)
    print()

src/evaluator/main.py

Lines changed: 59 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -11,6 +11,7 @@
1111
from .step_evaluator import StepEvaluator
1212
from .workflow import EvaluatorWorkflow
1313
from .utils import save_evaluation_results, discover_all_challenges
14+
from .cache import init_cache, clear_cache, print_cache_status, get_cache_stats, disable_cache
1415

1516
# ANSI color codes for terminal output
1617
class Colors:
@@ -149,6 +150,19 @@ def parse_args():
149150
default="command",
150151
help="Task mode: 'command' (predict commands), 'anticipated_result' (predict information/state changes), 'goal' (predict step goal) (default: command)"
151152
)
153+
parser.add_argument(
154+
"--prompt-style",
155+
type=str,
156+
choices=["default", "cot", "rubric", "minimal", "original"],
157+
default="default",
158+
help="Evaluation prompt style: 'default' (semantic + examples), 'cot' (chain-of-thought), 'rubric' (point-based scoring), 'minimal' (fast binary), 'original' (pre-refinement baseline) (default: default)"
159+
)
160+
parser.add_argument(
161+
"--max-steps",
162+
type=int,
163+
default=None,
164+
help="Maximum number of steps to evaluate (default: all steps)"
165+
)
152166
parser.add_argument(
153167
"--writeups-path",
154168
type=str,
@@ -190,6 +204,23 @@ def parse_args():
190204
help="Disable Phoenix tracing"
191205
)
192206

207+
# Cache options
208+
parser.add_argument(
209+
"--no-cache",
210+
action="store_true",
211+
help="Disable LLM response caching"
212+
)
213+
parser.add_argument(
214+
"--clear-cache",
215+
action="store_true",
216+
help="Clear the LLM cache before running"
217+
)
218+
parser.add_argument(
219+
"--cache-status",
220+
action="store_true",
221+
help="Show cache status and exit"
222+
)
223+
193224
return parser.parse_args()
194225

195226

@@ -201,6 +232,23 @@ def main():
201232
# Parse arguments
202233
args = parse_args()
203234

235+
# Handle cache-only commands first
236+
if args.cache_status:
237+
print_cache_status()
238+
return 0
239+
240+
# Clear cache if requested
241+
if args.clear_cache:
242+
print("Clearing LLM cache...")
243+
clear_cache(verbose=True)
244+
245+
# Initialize caching early (unless disabled)
246+
if not args.no_cache:
247+
init_cache(verbose=True)
248+
else:
249+
# Globally disable caching - prevents init_cache() calls in submodules
250+
disable_cache(verbose=True)
251+
204252
# Validate incompatible option combinations
205253
if args.task_mode == "goal" and args.include_goal == "always":
206254
print("Error: When task_mode is 'goal', --include-goal cannot be 'always'")
@@ -289,6 +337,7 @@ def main():
289337
else:
290338
print(f"Agent URL: {args.agent_url}")
291339
print(f"Evaluator Model: {args.evaluator_model}")
340+
print(f"Prompt Style: {args.prompt_style}")
292341
print(f"Max Iterations per Step: {args.max_iterations}")
293342
print(f"Output: {output_path}")
294343
print("=" * 70)
@@ -300,14 +349,15 @@ def main():
300349
agent_interface = create_agent_interface(agent_config)
301350

302351
# Create step evaluator
303-
print("Initializing step evaluator...")
352+
print(f"Initializing step evaluator (prompt style: {args.prompt_style})...")
304353
step_evaluator = StepEvaluator(
305354
model=args.evaluator_model,
306355
max_tokens=args.evaluator_max_tokens,
307356
api_key=api_key,
308357
base_url=base_url,
309358
evaluation_protocol=args.evaluation_protocol,
310-
task_mode=args.task_mode
359+
task_mode=args.task_mode,
360+
prompt_style=args.prompt_style
311361
)
312362

313363
# Create workflow
@@ -320,6 +370,7 @@ def main():
320370
agent_interface=agent_interface,
321371
step_evaluator=step_evaluator,
322372
max_iterations_per_step=args.max_iterations,
373+
max_steps=args.max_steps,
323374
enable_phoenix=not args.no_phoenix,
324375
include_goal=args.include_goal,
325376
include_tactic=args.include_tactic,
@@ -384,6 +435,12 @@ def main():
384435
print(f"Results saved to: {output_path}")
385436
print("=" * 70)
386437

438+
# Show cache status at the end
439+
if not args.no_cache:
440+
cache_stats = get_cache_stats()
441+
if cache_stats.get("enabled"):
442+
print(f"\n{Colors.BLUE}Cache Stats:{Colors.RESET} {cache_stats.get('cache_files', 0)} files, {cache_stats.get('cache_size_mb', 0)} MB")
443+
387444
return 0
388445

389446
except KeyboardInterrupt:
@@ -399,4 +456,3 @@ def main():
399456

400457
if __name__ == "__main__":
401458
sys.exit(main())
402-

0 commit comments

Comments
 (0)