Skip to content

Commit 7f8300c

Browse files
authored
Merge pull request #1 from LSX-UniWue/refined-evaluation-prompts
Add refined evaluation prompts and LiteLLM caching
2 parents 36f4903 + b0bf9c5 commit 7f8300c

File tree

17 files changed

+1751
-725
lines changed

17 files changed

+1751
-725
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -8,7 +8,7 @@ dependencies = [
88
"pydantic>=2.12.5",
99
"uvicorn>=0.38.0",
1010
"python-dotenv",
11-
"litellm",
11+
"litellm[caching]",
1212
"langgraph>=0.2.0",
1313
"langchain-core>=0.3.0",
1414
"langchain-community>=0.3.0",

src/agent.py

Lines changed: 4 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -170,14 +170,17 @@ async def run(self, message: Message, updater: TaskUpdater) -> None:
170170
evaluator_max_tokens = evaluator_config.get("max_tokens", 2000)
171171
api_key = evaluator_config.get("api_key") or os.getenv("OPENAI_API_KEY")
172172
base_url = evaluator_config.get("base_url") or os.getenv("OPENAI_BASE_URL")
173+
prompt_style = evaluator_config.get("prompt_style") or os.getenv("PROMPT_STYLE", "default")
174+
print(f"Using prompt style: {prompt_style}")
173175

174176
step_evaluator = StepEvaluator(
175177
model=evaluator_model,
176178
max_tokens=evaluator_max_tokens,
177179
api_key=api_key,
178180
base_url=base_url,
179181
evaluation_protocol=evaluation_protocol,
180-
task_mode=task_mode
182+
task_mode=task_mode,
183+
prompt_style=prompt_style
181184
)
182185

183186
# Create workflow

src/evaluator/agent_interface.py

Lines changed: 6 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -10,6 +10,9 @@
1010

1111
from ..messenger import Messenger
1212

13+
# Import caching utilities
14+
from .cache import init_cache, is_cache_enabled
15+
1316

1417
class AgentInterface(Runnable):
1518
"""Abstract base class for agent under test.
@@ -86,6 +89,9 @@ def __init__(
8689
if self.base_url:
8790
os.environ["OPENAI_BASE_URL"] = self.base_url
8891

92+
# Initialize LLM caching (no-op if already initialized)
93+
init_cache(verbose=not is_cache_enabled())
94+
8995
# Set system prompt based on task mode (not protocol)
9096
if self.task_mode == "command":
9197
self.system_prompt = (
@@ -453,4 +459,3 @@ def create_agent_interface(config: Dict[str, Any]) -> AgentInterface:
453459
)
454460
else:
455461
raise ValueError(f"Invalid mode: {mode}. Must be 'internal' or 'a2a'")
456-

src/evaluator/cache.py

Lines changed: 209 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,209 @@
1+
"""LLM caching configuration for the evaluator.
2+
3+
This module provides centralized caching setup for all LLM calls in the evaluator.
4+
Uses LiteLLM's built-in disk caching to reduce API costs and speed up repeated runs.
5+
"""
6+
7+
import os
8+
import litellm
9+
from pathlib import Path
10+
from typing import Optional
11+
12+
# ANSI escape sequences used for colorized console status messages.
class Colors:
    """Minimal ANSI color palette for terminal output."""
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RESET = '\033[0m'  # restores the terminal's default color
19+
20+
21+
# Global flags to track cache state.
# _cache_initialized: set True by init_cache() once litellm.cache is configured.
# _cache_disabled: set True by disable_cache() or the LITELLM_CACHE_DISABLED
# env var; once True, init_cache() is a no-op for the rest of this process.
_cache_initialized = False
_cache_disabled = False
24+
25+
26+
def disable_cache(verbose: bool = True) -> None:
    """Globally disable LLM caching.

    Must be called BEFORE any init_cache() call; once the flag is set,
    every subsequent init_cache() becomes a no-op.

    Args:
        verbose: Whether to print status message
    """
    global _cache_disabled
    _cache_disabled = True
    if not verbose:
        return
    print(f"{Colors.YELLOW}⚠ LLM caching disabled globally{Colors.RESET}")
39+
40+
41+
def is_cache_disabled() -> bool:
    """Return True when caching has been globally vetoed (disable_cache() or env)."""
    return bool(_cache_disabled)
44+
45+
46+
def get_cache_dir() -> Path:
    """Resolve the on-disk cache location.

    Returns:
        Path to the cache directory; the LITELLM_CACHE_DIR environment
        variable overrides the default ``.litellm_cache`` in the project root.
    """
    return Path(os.getenv("LITELLM_CACHE_DIR", ".litellm_cache"))
54+
55+
56+
def init_cache(
    cache_type: str = "disk",
    cache_dir: Optional[str] = None,
    ttl: Optional[int] = None,
    verbose: bool = True
) -> bool:
    """Initialize LiteLLM caching.

    This should be called once at startup. Subsequent calls are no-ops.
    If caching has been globally disabled via disable_cache() or the
    LITELLM_CACHE_DISABLED environment variable is set to "true", this is a no-op.

    Args:
        cache_type: Type of cache ("disk", "redis", "s3", or "local" for in-memory)
        cache_dir: Directory for disk cache (default: .litellm_cache)
        ttl: Time-to-live for cache entries in seconds (default: None = forever)
        verbose: Whether to print cache initialization status

    Returns:
        True if cache was initialized, False if already initialized or disabled
    """
    global _cache_initialized, _cache_disabled

    # Check environment variable for cache disable; latch it into the global
    # flag so later calls short-circuit without re-reading the environment.
    env_disabled = os.getenv("LITELLM_CACHE_DISABLED", "").lower() in ("true", "1", "yes")
    if env_disabled and not _cache_disabled:
        _cache_disabled = True
        if verbose:
            print(f"{Colors.YELLOW}⚠ LLM caching disabled via LITELLM_CACHE_DISABLED{Colors.RESET}")

    # Respect global disable flag
    if _cache_disabled:
        return False

    if _cache_initialized:
        if verbose:
            print(f"{Colors.YELLOW}⚠ LLM cache already initialized{Colors.RESET}")
        return False

    # Set cache directory
    if cache_dir is None:
        cache_dir = str(get_cache_dir())

    # Ensure cache directory exists for disk cache
    if cache_type == "disk":
        Path(cache_dir).mkdir(parents=True, exist_ok=True)

    # Configure LiteLLM cache
    cache_params = {
        "type": cache_type,
    }

    if cache_type == "disk":
        cache_params["disk_cache_dir"] = cache_dir

    if ttl is not None:
        cache_params["ttl"] = ttl

    # Initialize the cache
    litellm.cache = litellm.Cache(**cache_params)

    # Enable caching globally
    litellm.enable_cache()

    _cache_initialized = True

    if verbose:
        print(f"{Colors.GREEN}✓ LLM caching enabled{Colors.RESET}")
        print(f" Type: {cache_type}")
        if cache_type == "disk":
            print(f" Directory: {cache_dir}")
        # Fix: use the same `is not None` guard as the cache_params block above,
        # so a configured ttl of 0 is reported instead of silently omitted.
        if ttl is not None:
            print(f" TTL: {ttl}s")

    return True
131+
132+
133+
def is_cache_enabled() -> bool:
    """Check if caching is currently enabled.

    Returns:
        True only when init_cache() completed in this process AND litellm
        still holds a cache object.
    """
    if not _cache_initialized:
        return False
    return litellm.cache is not None
140+
141+
142+
def get_cache_stats() -> dict:
    """Get cache statistics (if available).

    Returns:
        Dictionary with cache statistics including disk cache info.
        Keys: "initialized", "enabled", "cache_dir", "cache_files",
        "cache_size_mb", "has_cache" (plus "cache_size_bytes" when the
        cache directory exists).
    """
    stats = {
        "initialized": _cache_initialized,
        # Fix: consumers (e.g. the end-of-run summary in main.py) check
        # stats.get("enabled"); without this key that check was always falsy
        # and the summary never printed.
        "enabled": is_cache_enabled(),
    }

    # Always check disk cache stats (even if not initialized in this process)
    cache_dir = get_cache_dir()
    stats["cache_dir"] = str(cache_dir)
    if cache_dir.exists():
        cache_files = list(cache_dir.glob("*"))
        stats["cache_files"] = len(cache_files)
        stats["cache_size_bytes"] = sum(f.stat().st_size for f in cache_files if f.is_file())
        stats["cache_size_mb"] = round(stats["cache_size_bytes"] / (1024 * 1024), 2)
        stats["has_cache"] = stats["cache_files"] > 0
    else:
        stats["cache_files"] = 0
        stats["cache_size_mb"] = 0
        stats["has_cache"] = False

    return stats
168+
169+
170+
def clear_cache(verbose: bool = True) -> bool:
    """Clear the LLM cache.

    Args:
        verbose: Whether to print status

    Returns:
        True if cache was cleared successfully
    """
    import shutil

    target = get_cache_dir()

    if not target.exists():
        if verbose:
            print(f"{Colors.YELLOW}⚠ Cache directory does not exist{Colors.RESET}")
        return False

    # Count entries first so the status line can report what was removed.
    files_count = len(list(target.glob("*")))

    # Wipe the directory and recreate it empty in place.
    shutil.rmtree(target)
    target.mkdir(parents=True, exist_ok=True)

    if verbose:
        print(f"{Colors.GREEN}✓ Cache cleared ({files_count} files removed){Colors.RESET}")
    return True
198+
199+
200+
def print_cache_status():
    """Print current cache status to console."""
    info = get_cache_stats()

    # Assemble the report, then emit it line by line (trailing blank included).
    report = [
        f"\n{Colors.BLUE}=== LLM Cache Status ==={Colors.RESET}",
        f" Directory: {info.get('cache_dir', 'N/A')}",
        f" Has cached data: {info.get('has_cache', False)}",
        f" Files: {info.get('cache_files', 0)}",
        f" Size: {info.get('cache_size_mb', 0)} MB",
    ]
    for line in report:
        print(line)
    print()

src/evaluator/main.py

Lines changed: 59 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -11,6 +11,7 @@
1111
from .step_evaluator import StepEvaluator
1212
from .workflow import EvaluatorWorkflow
1313
from .utils import save_evaluation_results, discover_all_challenges
14+
from .cache import init_cache, clear_cache, print_cache_status, get_cache_stats, disable_cache
1415

1516
# ANSI color codes for terminal output
1617
class Colors:
@@ -149,6 +150,19 @@ def parse_args():
149150
default="command",
150151
help="Task mode: 'command' (predict commands), 'anticipated_result' (predict information/state changes), 'goal' (predict step goal) (default: command)"
151152
)
153+
parser.add_argument(
154+
"--prompt-style",
155+
type=str,
156+
choices=["default", "cot", "rubric", "minimal", "original"],
157+
default="default",
158+
help="Evaluation prompt style: 'default' (semantic + examples), 'cot' (chain-of-thought), 'rubric' (point-based scoring), 'minimal' (fast binary), 'original' (pre-refinement baseline) (default: default)"
159+
)
160+
parser.add_argument(
161+
"--max-steps",
162+
type=int,
163+
default=None,
164+
help="Maximum number of steps to evaluate (default: all steps)"
165+
)
152166
parser.add_argument(
153167
"--writeups-path",
154168
type=str,
@@ -190,6 +204,23 @@ def parse_args():
190204
help="Disable Phoenix tracing"
191205
)
192206

207+
# Cache options
208+
parser.add_argument(
209+
"--no-cache",
210+
action="store_true",
211+
help="Disable LLM response caching"
212+
)
213+
parser.add_argument(
214+
"--clear-cache",
215+
action="store_true",
216+
help="Clear the LLM cache before running"
217+
)
218+
parser.add_argument(
219+
"--cache-status",
220+
action="store_true",
221+
help="Show cache status and exit"
222+
)
223+
193224
return parser.parse_args()
194225

195226

@@ -201,6 +232,23 @@ def main():
201232
# Parse arguments
202233
args = parse_args()
203234

235+
# Handle cache-only commands first
236+
if args.cache_status:
237+
print_cache_status()
238+
return 0
239+
240+
# Clear cache if requested
241+
if args.clear_cache:
242+
print("Clearing LLM cache...")
243+
clear_cache(verbose=True)
244+
245+
# Initialize caching early (unless disabled)
246+
if not args.no_cache:
247+
init_cache(verbose=True)
248+
else:
249+
# Globally disable caching - prevents init_cache() calls in submodules
250+
disable_cache(verbose=True)
251+
204252
# Validate incompatible option combinations
205253
if args.task_mode == "goal" and args.include_goal == "always":
206254
print("Error: When task_mode is 'goal', --include-goal cannot be 'always'")
@@ -289,6 +337,7 @@ def main():
289337
else:
290338
print(f"Agent URL: {args.agent_url}")
291339
print(f"Evaluator Model: {args.evaluator_model}")
340+
print(f"Prompt Style: {args.prompt_style}")
292341
print(f"Max Iterations per Step: {args.max_iterations}")
293342
print(f"Output: {output_path}")
294343
print("=" * 70)
@@ -300,14 +349,15 @@ def main():
300349
agent_interface = create_agent_interface(agent_config)
301350

302351
# Create step evaluator
303-
print("Initializing step evaluator...")
352+
print(f"Initializing step evaluator (prompt style: {args.prompt_style})...")
304353
step_evaluator = StepEvaluator(
305354
model=args.evaluator_model,
306355
max_tokens=args.evaluator_max_tokens,
307356
api_key=api_key,
308357
base_url=base_url,
309358
evaluation_protocol=args.evaluation_protocol,
310-
task_mode=args.task_mode
359+
task_mode=args.task_mode,
360+
prompt_style=args.prompt_style
311361
)
312362

313363
# Create workflow
@@ -320,6 +370,7 @@ def main():
320370
agent_interface=agent_interface,
321371
step_evaluator=step_evaluator,
322372
max_iterations_per_step=args.max_iterations,
373+
max_steps=args.max_steps,
323374
enable_phoenix=not args.no_phoenix,
324375
include_goal=args.include_goal,
325376
include_tactic=args.include_tactic,
@@ -384,6 +435,12 @@ def main():
384435
print(f"Results saved to: {output_path}")
385436
print("=" * 70)
386437

438+
# Show cache status at the end
439+
if not args.no_cache:
440+
cache_stats = get_cache_stats()
441+
if cache_stats.get("enabled"):
442+
print(f"\n{Colors.BLUE}Cache Stats:{Colors.RESET} {cache_stats.get('cache_files', 0)} files, {cache_stats.get('cache_size_mb', 0)} MB")
443+
387444
return 0
388445

389446
except KeyboardInterrupt:
@@ -399,4 +456,3 @@ def main():
399456

400457
if __name__ == "__main__":
401458
sys.exit(main())
402-

0 commit comments

Comments
 (0)