# llm_tournament.py

#!/usr/bin/env python3
"""
LLM Multi-Round Coding Tournament Automation

This script automates the process of running a multi-round coding tournament between
different LLMs, where models analyze and improve each other's solutions across
multiple rounds of refinement and integration.

Features:
- Supports multiple LLM providers through the aisuite API
- Creates a structured directory hierarchy for organizing tournament artifacts
- Manages the full lifecycle of prompt creation, response collection, and analysis
- Generates test scripts to evaluate the performance of each solution
- Tracks and reports detailed metrics about each solution
- Supports configurable tournament parameters (rounds, models, etc.)
- Handles error recovery and rate limiting
"""

import os
import re
import time
import json
import logging
import argparse
import traceback
import statistics
import subprocess
import textwrap
import threading
from typing import List, Dict, Tuple, Any, Optional, Union, Set
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
import aisuite as ai

load_dotenv()  # Load variables from a local .env file (e.g. provider API keys) before any client is created

# Configure logging: every record goes both to "tournament.log" (relative to
# the current working directory) and to the console stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("tournament.log"),
        logging.StreamHandler()
    ]
)
# Module-wide logger; LLMTournament lowers its level to DEBUG when verbose=True.
logger = logging.getLogger("llm_tournament")

# Default model roster, keyed by the short name used in file names and prompts.
# Per-model keys read by LLMTournament.query_model:
#   "id"        - "<provider>:<model>" identifier passed as `model` to the aisuite client
#   "thinking"  - when True, the system message additionally asks the model to show its reasoning
#   "provider"  - selects provider-specific request parameters (OpenAI vs. everything else)
#   "max_tokens" (optional, not set below) - completion budget; query_model falls back to 4096
MODELS = {
    "o3_mini": {
        "id": "openai:o3-mini",
        "thinking": True,
        "provider": "openai"
    },
    "gpt4o": {
        "id": "openai:gpt-4o",
        "thinking": False,
        "provider": "openai"
    },
    "claude37": {
        "id": "anthropic:claude-3-7-sonnet-20250219",
        "thinking": True,
        "provider": "anthropic"
    },
    "mistral_large": {
        "id": "mistral:mistral-large-latest",
        "thinking": True,
        "provider": "mistral"
    }
}

# Maximum number of attempts query_model makes per model before giving up
MAX_RETRIES = 3

# Seconds to sleep before each retry; indexed by the (0-based) attempt number,
# clamped to the last entry if there are more attempts than entries.
BACKOFF_TIMES = [5, 15, 30]

@dataclass
class ModelResponse:
    """
    One model's answer for one tournament round, plus artifacts derived from it.

    Attributes:
        model_name: Short name of the model that produced the response.
        round_num: Tournament round the response belongs to.
        prompt: Prompt text that was sent to the model.
        response: Raw response text returned by the model.
        file_path: Where the raw response was saved (set by save_to_file).
        code: Code extracted from the response (set by extract_code).
        thinking: Reasoning section found in the response (set by extract_thinking).
        error: Description of the last error seen while obtaining the response.
        metrics: Measurements about the response and its code.
        timestamp: Creation time of this object.
    """
    model_name: str
    round_num: int
    prompt: str
    response: str
    file_path: Optional[Path] = None
    code: Optional[str] = None
    thinking: Optional[str] = None
    error: Optional[str] = None
    metrics: Dict[str, Any] = field(default_factory=dict)
    timestamp: datetime = field(default_factory=datetime.now)

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """
        Remove a surrounding markdown code fence from *text*, if present.

        Surrounding whitespace is stripped first so that a reply such as
        "```python\\n...\\n```\\n\\n" (trailing blank lines) or one with leading
        spaces still has both fence lines removed.
        """
        cleaned = text.strip()
        cleaned = re.sub(r'^```[\w+-]*[ \t]*\n', '', cleaned)
        cleaned = re.sub(r'\n```$', '', cleaned)
        return cleaned.strip()

    def _write_code_file(self, code_file_path: Path, kind: str) -> None:
        """Best-effort write of self.code to *code_file_path*; failures are only logged."""
        try:
            with open(code_file_path, "w", encoding="utf-8") as f:
                f.write(self.code)
            logger.info(f"Saved {kind} code to {code_file_path}")
        except Exception as e:
            logger.error(f"Error saving {kind} code to file: {str(e)}")

    def extract_code(self, output_dir: Optional[Path] = None) -> str:
        """
        Extract code from the model's response using the AI suite to process and structure it.
        If the code has already been extracted and saved to a file, load it from there instead.

        Args:
            output_dir: Base directory for tournament artifacts (optional). When given,
                the result is cached in ``output_dir / "extracted_code"``.

        Returns:
            Extracted and cleaned code as a string ("" when the response is empty).
            If the extraction call fails, a minimal pass-through solution class is
            returned (and cached) instead.
        """
        if not self.response:
            return ""

        # If code has already been extracted, return it
        if self.code:
            return self.code

        # If output_dir is provided, check if the extracted code file exists
        code_file_path: Optional[Path] = None
        if output_dir:
            extracted_code_dir = output_dir / "extracted_code"
            extracted_code_dir.mkdir(exist_ok=True, parents=True)

            # Create filename from model name and round number
            code_filename = f"extracted_code__round_{self.round_num}__{self.model_name}.py"
            code_file_path = extracted_code_dir / code_filename

            # Check if the file already exists
            if code_file_path.exists():
                try:
                    logger.info(f"Loading previously extracted code for {self.model_name} Round {self.round_num} from file")
                    with open(code_file_path, "r", encoding="utf-8") as f:
                        self.code = f.read()
                    return self.code
                except Exception as e:
                    logger.warning(f"Error loading extracted code from file: {str(e)}. Re-extracting from response.")

        try:
            # Create a prompt for the AI to extract and transform the code
            prompt = f"""
    Extract all code from the text below and format it as a complete, self-contained class 
    with the name {self.model_name}Round{self.round_num}Solution.

    The class should:
    1. Include all necessary imports at the top
    2. Have a static 'solve' method that takes 'input_text' as its parameter
    3. Contain all functions from the original code
    4. Ensure the solve method correctly calls the main function with the input_text parameter
    5. Be properly indented and formatted for execution

    Important: Provide ONLY the complete code WITHOUT ANY explanations, comments about the task,
    or markdown formatting. Do not include any text before or after the code.

    Here is the text to extract code from:

    {self.response}
    """

            # Call the AI service
            client = ai.Client()
            # Use the logger (not print) so the message is serialized with the
            # rest of the output when several models are processed in threads.
            logger.info(f"Submitting model response to Claude3.7 to extract the code from {self.model_name} Round {self.round_num} and turn it into a self-contained class...")
            response = client.chat.completions.create(
                model="anthropic:claude-3-7-sonnet-20250219",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=25000
            )

            # Get the extracted code and remove any markdown formatting
            self.code = self._strip_code_fences(response.choices[0].message.content)
            kind = "extracted"

        except Exception as e:
            logger.error(f"Error using AI to extract code: {str(e)}")
            logger.debug(traceback.format_exc())

            # Fallback to a minimal implementation
            fallback_code = f"""
    class {self.model_name}Round{self.round_num}Solution:
        \"\"\"Solution from {self.model_name} at round {self.round_num}\"\"\"
        
        @staticmethod
        def solve(input_text):
            \"\"\"Apply the solution to the input text\"\"\"
            # Simple fallback implementation that returns the input
            return input_text
    """
            self.code = fallback_code.strip()
            kind = "fallback"

        # Save to file if output_dir is provided
        if code_file_path is not None:
            self._write_code_file(code_file_path, kind)

        return self.code

    def extract_thinking(self) -> Optional[str]:
        """
        Extract the thought process or reasoning from the model's response.

        The first pattern (in the order listed below) that matches wins, and
        only its first match is kept.

        Returns:
            Extracted thinking content if found, None otherwise
        """
        # Look for chain-of-thought sections marked with common patterns
        patterns = [
            r'<thinking>(.*?)</thinking>',
            r'### Thinking\s*\n(.*?)(?:\n###|\Z)',
            r'## Thought Process\s*\n(.*?)(?:\n##|\Z)',
            r'Step-by-step reasoning:\s*\n(.*?)(?:\n#|\Z)',
            r'Let me think through this:\s*\n(.*?)(?:\nFinal answer:|\Z)'
        ]

        for pattern in patterns:
            matches = re.findall(pattern, self.response, re.DOTALL)
            if matches:
                self.thinking = matches[0].strip()
                return self.thinking

        return None

    def calculate_metrics(self) -> Dict[str, Any]:
        """
        Calculate various metrics about the response and code.

        The computed values are merged into ``self.metrics`` instead of
        replacing it, so measurements recorded earlier by the caller (such as
        ``response_time``) are kept.

        Returns:
            The updated ``self.metrics`` dictionary
        """
        # Extract code if not already done
        if not self.code:
            self.extract_code()

        # Calculate basic metrics
        response_size = len(self.response)
        response_lines = self.response.count('\n') + 1

        code_size = len(self.code) if self.code else 0
        code_lines = self.code.count('\n') + 1 if self.code else 0

        # Compute code complexity metrics using the class-structured code
        complexity_metrics = self._compute_code_complexity()

        # Combine metrics
        metrics = {
            "response_size_kb": round(response_size / 1024, 2),
            "response_lines": response_lines,
            "code_size_kb": round(code_size / 1024, 2),
            "code_lines": code_lines,
            "timestamp": self.timestamp.isoformat(),
            **complexity_metrics
        }

        # Merge in place: replacing the dict would drop keys set by callers.
        self.metrics.update(metrics)
        return self.metrics

    def _compute_code_complexity(self) -> Dict[str, Any]:
        """
        Compute code complexity metrics like function count, class count, etc.
        from the class-structured code.

        These are regex-based estimates, not a parse of the code.

        Returns:
            Dictionary of complexity metrics (empty when there is no code)
        """
        if not self.code:
            return {}

        # Count functions - now looking for methods and static methods in class structure
        function_count = len(re.findall(r'(?:^|\s)def\s+([a-zA-Z0-9_]+)\s*\(', self.code))

        # Count classes (should be at least 1 - the solution class)
        class_count = len(re.findall(r'(?:^|\s)class\s+([a-zA-Z0-9_]+)', self.code))

        # Count import statements. Both patterns are anchored to the start of a
        # line so that the "import" keyword inside "from x import y" is not
        # counted a second time as a plain import.
        import_count = len(re.findall(r'^[ \t]*import\s+\S', self.code, re.MULTILINE))
        from_import_count = len(
            re.findall(r'^[ \t]*from\s+[a-zA-Z0-9_.]+\s+import\b', self.code, re.MULTILINE)
        )
        total_imports = import_count + from_import_count

        # Estimate cyclomatic complexity by counting decision points
        decision_patterns = [
            r'\bif\s+', r'\belif\s+', r'\belse\s*:',
            r'\bfor\s+', r'\bwhile\s+', r'\bexcept\s*',
            r'\btry\s*:'
        ]

        decision_count = sum(len(re.findall(pattern, self.code)) for pattern in decision_patterns)

        return {
            "function_count": function_count,
            "class_count": class_count,
            "import_count": total_imports,
            "decision_points": decision_count,
            # Average decision points per function; guard against division by zero
            "complexity_estimate": decision_count / (function_count if function_count else 1)
        }

    def save_to_file(self, output_dir: Path) -> Path:
        """
        Save the response to a file.

        Args:
            output_dir: Directory where the file should be saved

        Returns:
            Path to the saved file
        """
        # Create the output directory if it doesn't exist
        output_dir.mkdir(exist_ok=True, parents=True)

        # Create a filename from the model name and round number
        filename = f"tournament_response__round_{self.round_num}__{self.model_name}.md"
        file_path = output_dir / filename

        # Write the response to the file
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(self.response)

        self.file_path = file_path
        return file_path


class LLMTournament:
    """
    Main class for managing the multi-round LLM tournament.
    
    This class handles the full lifecycle of the tournament, including:
    - Creating and managing the directory structure
    - Querying LLMs for responses
    - Creating prompts for each round
    - Tracking metrics and generating reports
    - Testing solutions
    """
    
    def __init__(
        self,
        prompt: str,
        rounds: int = 5,
        output_dir: str = "tournament_results",
        models: Dict[str, Dict[str, Any]] = None,
        temperature: float = 0.7,
        concurrent_requests: int = 2,
        test_file: Optional[str] = None,
        verbose: bool = False
    ):
        """
        Set up a tournament: store the configuration, create the API client,
        prepare per-round response storage and build the output directories.

        Args:
            prompt: The initial coding challenge prompt
            rounds: Number of rounds to run (not including round 0)
            output_dir: Base directory for tournament artifacts
            models: Dictionary of model configurations (defaults to MODELS
                when omitted or empty)
            temperature: Temperature setting for generation
            concurrent_requests: Maximum number of concurrent API requests
            test_file: Optional path to a file for testing solutions
            verbose: Whether to enable verbose (DEBUG) logging
        """
        # Caller-supplied configuration, kept on the instance as given
        self.prompt = prompt
        self.rounds = rounds
        self.output_dir = Path(output_dir)
        self.models = models or MODELS
        self.temperature = temperature
        self.concurrent_requests = concurrent_requests
        self.test_file = test_file
        self.verbose = verbose

        # Verbose mode switches the module logger to DEBUG
        if verbose:
            logger.setLevel(logging.DEBUG)

        # Client used for every model query
        self.client = ai.Client()

        # One (initially empty) response table per round, rounds 0..rounds inclusive
        self.responses: Dict[int, Dict[str, ModelResponse]] = {}
        for round_index in range(rounds + 1):
            self.responses[round_index] = {}

        # Tournament-wide metrics, filled in as rounds complete
        self.metrics: Dict[str, Any] = {}

        # Make sure the on-disk layout exists before anything is written
        self.setup_directories()

        # Report what was configured
        model_total = len(self.models)
        logger.info(f"Initialized LLM Tournament with {model_total} models for {rounds} rounds")
        logger.info(f"Output directory: {self.output_dir}")
        logger.debug(f"Models: {list(self.models.keys())}")

    def setup_directories(self) -> None:
        """
        Create the on-disk layout used by the tournament (idempotent).

        Layout created under ``self.output_dir``:
            round_0_responses/ ... round_N_responses/   (N = self.rounds)
            output_results_for_each_round_and_model/
            extracted_code/
            metrics/
        """
        base_dir = self.output_dir

        # The base directory may be nested, so create missing parents too
        base_dir.mkdir(exist_ok=True, parents=True)

        # Per-round response directories first, then the fixed ones
        round_dir_names = [f"round_{index}_responses" for index in range(self.rounds + 1)]
        fixed_dir_names = [
            "output_results_for_each_round_and_model",
            "extracted_code",
            "metrics",
        ]
        for dir_name in round_dir_names + fixed_dir_names:
            (base_dir / dir_name).mkdir(exist_ok=True)

        logger.debug(f"Created directory structure in {self.output_dir}")

    def query_model(
        self,
        model_name: str,
        round_num: int,
        prompt_text: str
    ) -> ModelResponse:
        """
        Query an LLM with the given prompt and handle retries and errors.

        If a response file for this model and round already exists it is
        loaded instead of calling the API. Otherwise the request is attempted
        up to MAX_RETRIES times, sleeping BACKOFF_TIMES[attempt] seconds
        between attempts. Successful responses are stored in
        ``self.responses[round_num][model_name]``; a response that failed on
        every attempt is returned but not stored there.

        Args:
            model_name: Name of the model to query (a key of self.models)
            round_num: Current tournament round number
            prompt_text: The prompt to send to the model

        Returns:
            ModelResponse object containing the model's response and metadata.
            After exhausting all retries its ``response`` starts with
            "ERROR: Failed to get response" and ``error`` holds the last error.
        """
        # Check if we already have a valid response file for this model and round
        response_dir = self.output_dir / f"round_{round_num}_responses"
        response_filename = f"tournament_response__round_{round_num}__{model_name}.md"
        response_file_path = response_dir / response_filename

        if response_file_path.exists():
            logger.info(f"Found existing response file for {model_name} (round {round_num}). Loading...")
            try:
                # Read the existing response
                with open(response_file_path, "r", encoding="utf-8") as f:
                    response_text = f.read()

                # Create a ModelResponse object
                response_obj = ModelResponse(
                    model_name=model_name,
                    round_num=round_num,
                    prompt=prompt_text,
                    response=response_text,
                    file_path=response_file_path
                )

                # Extract code (will use cached file if available) and thinking components
                response_obj.extract_code(self.output_dir)
                response_obj.extract_thinking()

                # Calculate metrics
                response_obj.calculate_metrics()

                # Log success
                logger.info(f"Successfully loaded existing response for {model_name} (round {round_num})")

                # Store in the responses dictionary
                self.responses[round_num][model_name] = response_obj

                return response_obj
            except Exception as e:
                logger.warning(f"Error loading existing response for {model_name} (round {round_num}): {str(e)}. Requesting a new one.")

        model_config = self.models[model_name]
        model_id = model_config["id"]
        thinking_enabled = model_config.get("thinking", False)
        max_tokens = model_config.get("max_tokens", 4096)
        provider = model_config.get("provider", "")

        # Create system message based on model capabilities
        system_message = "You are an expert programmer specializing in writing clean, efficient, and robust code."

        if thinking_enabled:
            system_message += (
                " Please think through the problem carefully before providing your solution. "
                "Show your reasoning process and explain key design decisions."
            )

        logger.info(f"Querying {model_name} (round {round_num})...")

        # Create a ModelResponse object to store the result
        response_obj = ModelResponse(
            model_name=model_name,
            round_num=round_num,
            prompt=prompt_text,
            response=""
        )

        # Build the request ONCE, outside the retry loop. The error handler
        # below removes parameters the provider rejects; rebuilding the dict on
        # every attempt would silently put them back and repeat the same error.
        api_params = {
            "model": model_id,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt_text}
            ]
        }

        # Handle provider-specific parameters
        if provider.lower() == "openai":
            # o3-mini has parameter limitations: send neither temperature nor
            # a token limit. Other OpenAI models get both.
            if "o3-mini" not in model_id.lower():
                api_params["max_completion_tokens"] = max_tokens
                api_params["temperature"] = self.temperature
        else:
            # Other providers use standard parameters
            api_params["max_tokens"] = max_tokens
            api_params["temperature"] = self.temperature

        # Try the API call with retries
        for attempt in range(MAX_RETRIES):
            try:
                start_time = time.time()

                # Make the API call
                api_response = self.client.chat.completions.create(**api_params)

                # Calculate response time
                response_time = time.time() - start_time

                # Extract and store the response text
                response_text = api_response.choices[0].message.content
                response_obj.response = response_text

                # Extract code (and save to file) and thinking components
                response_obj.extract_code(self.output_dir)
                response_obj.extract_thinking()

                # Calculate metrics, then record the response time. The order
                # matters: calculate_metrics may replace the metrics dict, so a
                # value stored before the call could be lost.
                response_obj.calculate_metrics()
                response_obj.metrics["response_time"] = round(response_time, 2)

                # A successful attempt supersedes errors from earlier attempts
                response_obj.error = None

                # Log success
                logger.info(f"Received response from {model_name} ({round(response_time, 2)}s)")
                logger.debug(f"Response metrics: {response_obj.metrics}")

                # Store in the responses dictionary
                self.responses[round_num][model_name] = response_obj

                return response_obj

            except Exception as e:
                # Log the error
                error_msg = f"Error querying {model_name} (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}"
                logger.error(error_msg)
                logger.debug(traceback.format_exc())

                # If we're seeing an unsupported parameter error, try to adapt
                if "Unsupported parameter" in str(e):
                    error_details = str(e)
                    logger.info("Detected unsupported parameter error. Adapting request for next attempt.")

                    # Extract unsupported parameter name if possible
                    param_match = re.search(r"'([^']+)' is not supported", error_details)
                    if param_match and param_match.group(1) in api_params:
                        unsupported_param = param_match.group(1)
                        logger.info(f"Removing unsupported parameter: {unsupported_param}")
                        api_params.pop(unsupported_param, None)

                # Store error information
                response_obj.error = error_msg

                # Retry with backoff if not the last attempt
                if attempt < MAX_RETRIES - 1:
                    backoff_time = BACKOFF_TIMES[min(attempt, len(BACKOFF_TIMES) - 1)]
                    logger.info(f"Retrying in {backoff_time} seconds...")
                    time.sleep(backoff_time)
                else:
                    # If all retries failed, return the error response
                    response_obj.response = f"ERROR: Failed to get response after {MAX_RETRIES} attempts.\n\n{error_msg}"
                    return response_obj

        # Only reachable when MAX_RETRIES is 0 (the loop body never runs)
        return response_obj

    def create_round_prompt(self, round_num: int) -> str:
        """
        Create the prompt for a specific round by combining responses from the previous round.
        
        Args:
            round_num: The round number to create a prompt for
            
        Returns:
            The combined prompt text for the next round
        """
        # For round 0, just use the original prompt
        if round_num == 0:
            return self.prompt
        
        # For subsequent rounds, combine the responses from the previous round
        prev_round = round_num - 1
        
        # Start with the standard combination prompt
        combined_prompt = f"""
I have the following problem which I posed to 4 different LLMs. I want you to carefully read the problem and then each solution. Choose the best ideas and elements from ALL solutions to the extent they are complementary rather than conflicting/inconsistent, and then weave together a true hybrid "best of all worlds" implementation which you are highly confident will not only work, but will outperform any of the individual solutions individually.

Original prompt:

{self.prompt}

Responses from different LLMs:
"""
        
        # Add each model's response
        for model_name, model_response in self.responses[prev_round].items():
            # For round 1, include the full response; for later rounds, just include the code
            if round_num == 1:
                content = model_response.response
            else:
                content = model_response.code or model_response.response
            
            combined_prompt += f"\n\n{model_name}:\n\n```python\n{content}\n```\n"
        
        # Add specific instructions for synthesis
        combined_prompt += """
Analyze each solution carefully, identifying strengths and weaknesses. Consider:
1. Correctness - Does the code handle all cases properly?
2. Efficiency - Is the code optimized for performance?
3. Readability - Is the code clear and maintainable?
4. Robustness - Does the code handle errors gracefully?

Then create a new implementation that combines the best aspects of all solutions.
Your implementation should be complete and ready to use without modification.
"""
        
        return combined_prompt

    def _all_responses_exist(self, round_num: int) -> bool:
        """
        Check if all response files for a specific round already exist.
        """
        response_dir = self.output_dir / f"round_{round_num}_responses"
        
        for model_name in self.models.keys():
            response_filename = f"tournament_response__round_{round_num}__{model_name}.md"
            response_file_path = response_dir / response_filename
            
            if not response_file_path.exists():
                return False
        
        return True

    def _load_existing_responses(self, round_num: int) -> Dict[str, ModelResponse]:
        """
        Rebuild the responses of a round from files saved by an earlier run.

        For every configured model whose response file exists, a ModelResponse
        is created, its code/thinking/metrics are derived, and it is registered
        in ``self.responses[round_num]``. Models without a file, or whose file
        cannot be processed, are skipped (the latter with an error log).
        Afterwards the round comparison file is regenerated and the tournament
        metrics are updated.

        Args:
            round_num: Round whose saved responses should be loaded

        Returns:
            Mapping of model name to the ModelResponse loaded for it
        """
        loaded: Dict[str, ModelResponse] = {}
        round_dir = self.output_dir / f"round_{round_num}_responses"

        # The saved round prompt is optional; fall back to an empty string
        saved_prompt = ""
        saved_prompt_path = self.output_dir / f"prompt_round_{round_num}.md"
        if saved_prompt_path.exists():
            try:
                saved_prompt = saved_prompt_path.read_text(encoding="utf-8")
            except Exception as e:
                logger.warning(f"Error loading prompt for round {round_num}: {str(e)}")

        for model_name in self.models:
            saved_response_path = round_dir / f"tournament_response__round_{round_num}__{model_name}.md"

            # Nothing saved for this model: leave it out of the result
            if not saved_response_path.exists():
                continue

            try:
                restored = ModelResponse(
                    model_name=model_name,
                    round_num=round_num,
                    prompt=saved_prompt,
                    response=saved_response_path.read_text(encoding="utf-8"),
                    file_path=saved_response_path
                )

                # Derive code (cached file is reused when present), thinking and metrics
                restored.extract_code(self.output_dir)
                restored.extract_thinking()
                restored.calculate_metrics()

                # Register both in the return value and on the tournament
                loaded[model_name] = restored
                self.responses[round_num][model_name] = restored

                logger.info(f"Loaded existing response for {model_name} (round {round_num})")
            except Exception as e:
                logger.error(f"Error loading existing response for {model_name} (round {round_num}): {str(e)}")

        # Refresh the derived artifacts for this round
        self.create_round_comparison_file(round_num)
        self.update_metrics(round_num, loaded)

        return loaded

    def run_round(self, round_num: int) -> Dict[str, ModelResponse]:
        """
        Run a single round of the tournament, querying all models in parallel.

        Shortcuts taken before querying:
        - if every model already has a response file for this round, the
          saved responses are loaded instead;
        - if the NEXT round is already complete, whatever responses exist for
          this round are loaded and no new queries are made.

        Otherwise the round prompt is built and saved, all models are queried
        through a thread pool of ``self.concurrent_requests`` workers, and each
        successful response is written to the round's response directory.

        Args:
            round_num: The round to run

        Returns:
            Mapping of model name to the ModelResponse obtained for it
            (including responses that ultimately failed)
        """
        logger.info(f"Starting Round {round_num}")

        # Check if all response files for this round already exist
        if self._all_responses_exist(round_num):
            logger.info(f"All responses for round {round_num} already exist. Loading...")
            return self._load_existing_responses(round_num)

        # Check if the next round is already complete, which means we can skip prompt creation
        # for this round and just load whatever responses exist
        if round_num < self.rounds and self._all_responses_exist(round_num + 1):
            logger.info(f"Next round {round_num + 1} is already complete. Loading existing responses for round {round_num}...")
            return self._load_existing_responses(round_num)

        # Create the prompt for this round
        round_prompt = self.create_round_prompt(round_num)

        # Save the prompt for reference
        prompt_file = self.output_dir / f"prompt_round_{round_num}.md"
        with open(prompt_file, "w", encoding="utf-8") as f:
            f.write(round_prompt)

        # Query all models in parallel
        round_responses = {}
        response_dir = self.output_dir / f"round_{round_num}_responses"

        with ThreadPoolExecutor(max_workers=self.concurrent_requests) as executor:
            # Submit all model queries
            future_to_model = {
                executor.submit(self.query_model, model_name, round_num, round_prompt): model_name
                for model_name in self.models.keys()
            }

            # Process results as they complete
            for future in as_completed(future_to_model):
                model_name = future_to_model[future]
                try:
                    response = future.result()
                    round_responses[model_name] = response

                    # Do not persist the placeholder text of a query that
                    # failed on every attempt: response files double as the
                    # resume cache, so writing "ERROR: ..." would make later
                    # runs load the failure as a valid answer and never retry.
                    if response.error and response.response.startswith("ERROR: Failed to get response"):
                        logger.warning(
                            f"Not saving failed response from {model_name} for round {round_num}; "
                            "it will be retried on the next run"
                        )
                        continue

                    # Save the response to a file
                    response.save_to_file(response_dir)

                    logger.info(f"Saved response from {model_name} for round {round_num}")

                except Exception as e:
                    logger.error(f"Error processing response from {model_name}: {str(e)}")
                    logger.debug(traceback.format_exc())

        # Create comparison file for all responses in this round
        self.create_round_comparison_file(round_num)

        # Update metrics
        self.update_metrics(round_num, round_responses)

        # Return the responses
        return round_responses

    def create_round_comparison_file(self, round_num: int) -> Path:
        """
        Create a markdown file comparing all responses for a specific round.
        
        Args:
            round_num: The round number to create a comparison for
            
        Returns:
            Path to the created comparison file
        """
        if round_num == 0:
            return None
            
        # Create the comparison content
        comparison_content = f"""# Round {round_num} Response Comparison

## Original Prompt

```
{self.prompt}
```

## Model Responses

"""
        
        # Add each model's response
        for model_name, response in self.responses[round_num].items():
            # Extract metrics
            metrics = response.metrics
            code_lines = metrics.get("code_lines", 0)
            code_size = metrics.get("code_size_kb", 0)
            
            comparison_content += f"### {model_name}\n\n"
            comparison_content += f"**Metrics:** {code_lines} lines, {code_size} KB\n\n"
            comparison_content += "```python\n"
            comparison_content += response.code or "# No code extracted"
            comparison_content += "\n```\n\n"
        
        # Save the comparison file
        comparison_file = self.output_dir / f"markdown_table_prompt_response_comparison__round_{round_num}.md"
        with open(comparison_file, "w", encoding="utf-8") as f:
            f.write(comparison_content)
            
        logger.info(f"Created comparison file for round {round_num}: {comparison_file}")
        return comparison_file

    def update_metrics(self, round_num: int, round_responses: Dict[str, ModelResponse]) -> None:
        """
        Record one round's results in the tournament metrics and persist them.

        Stores each model's metrics plus average/max/min aggregates of code
        size and line count under self.metrics["rounds"][round_num], then
        calls save_metrics().

        Args:
            round_num: The round number
            round_responses: Dictionary of model responses from the round
        """
        # Make sure the per-round container is present
        self.metrics.setdefault("rounds", {})

        entry = {
            "timestamp": datetime.now().isoformat(),
            "models": {name: resp.metrics for name, resp in round_responses.items()},
        }

        # Both lists have one element per response, so one emptiness check
        # covers them both.
        sizes_kb = [resp.metrics.get("code_size_kb", 0) for resp in round_responses.values()]
        line_counts = [resp.metrics.get("code_lines", 0) for resp in round_responses.values()]

        if sizes_kb:
            aggregate = {
                "avg_code_size_kb": round(statistics.mean(sizes_kb), 2),
                "avg_code_lines": round(statistics.mean(line_counts), 2),
                "max_code_size_kb": round(max(sizes_kb), 2),
                "max_code_lines": max(line_counts),
                "min_code_size_kb": round(min(sizes_kb), 2),
                "min_code_lines": min(line_counts),
            }
        else:
            # No responses this round: every aggregate is reported as 0
            aggregate = {
                "avg_code_size_kb": 0,
                "avg_code_lines": 0,
                "max_code_size_kb": 0,
                "max_code_lines": 0,
                "min_code_size_kb": 0,
                "min_code_lines": 0,
            }
        entry["aggregate"] = aggregate

        # Publish the round and write the metrics file
        self.metrics["rounds"][round_num] = entry
        self.save_metrics()

    def save_metrics(self) -> Path:
        """
        Save the tournament metrics to a JSON file.

        Writes self.metrics to <output_dir>/metrics/tournament_metrics.json,
        creating the metrics directory first if it does not exist.

        Returns:
            Path to the saved metrics file
        """
        metrics_file = self.output_dir / "metrics" / "tournament_metrics.json"

        # Do not rely on the directory having been created elsewhere
        metrics_file.parent.mkdir(parents=True, exist_ok=True)

        with open(metrics_file, "w", encoding="utf-8") as f:
            json.dump(self.metrics, f, indent=2)

        logger.debug(f"Saved metrics to {metrics_file}")
        return metrics_file

    def generate_metrics_report(self) -> Path:
        """
        Generate a detailed metrics report in markdown format.
        
        Returns:
            Path to the generated report
        """
        # Create report content
        report_content = f"""# LLM Tournament Metrics Report

## Tournament Overview

- **Start Time:** {self.metrics.get("start_time", "Unknown")}
- **End Time:** {self.metrics.get("end_time", "Unknown")}
- **Total Rounds:** {self.rounds + 1} (including round 0)
- **Models:** {", ".join(self.models.keys())}

## Round Summaries

"""
        
        # Add metrics for each round
        for round_num in range(self.rounds + 1):
            round_metrics = self.metrics.get("rounds", {}).get(str(round_num), {})
            if not round_metrics:
                continue
                
            report_content += f"### Round {round_num}\n\n"
            
            # Create a table of metrics
            report_content += "| Model | Code Size (KB) | Code Lines | Functions | Complexity |\n"
            report_content += "|-------|---------------|------------|-----------|------------|\n"
            
            for model_name, model_metrics in round_metrics.get("models", {}).items():
                code_size = model_metrics.get("code_size_kb", "N/A")
                code_lines = model_metrics.get("code_lines", "N/A")
                function_count = model_metrics.get("function_count", "N/A")
                complexity = model_metrics.get("complexity_estimate", "N/A")
                
                report_content += f"| {model_name} | {code_size} | {code_lines} | {function_count} | {complexity} |\n"
            
            report_content += "\n"
        
        # Add convergence analysis
        report_content += "## Convergence Analysis\n\n"
        
        # Plot code size and lines over rounds
        report_content += "### Code Size Over Rounds\n\n"
        report_content += "| Round | " + " | ".join(self.models.keys()) + " |\n"
        report_content += "|-------| " + " | ".join(["-" * len(name) for name in self.models.keys()]) + " |\n"
        
        for round_num in range(self.rounds + 1):
            round_metrics = self.metrics.get("rounds", {}).get(str(round_num), {})
            if not round_metrics:
                continue
                
            row = [str(round_num)]
            
            for model_name in self.models.keys():
                model_metrics = round_metrics.get("models", {}).get(model_name, {})
                code_size = model_metrics.get("code_size_kb", "N/A")
                row.append(str(code_size))
                
            report_content += "| " + " | ".join(row) + " |\n"
        
        # Save the report
        report_file = self.output_dir / "metrics" / "tournament_report.md"
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(report_content)
            
        logger.info(f"Generated metrics report: {report_file}")
        return report_file

    def create_test_suite(self) -> Tuple[Path, Path]:
        """
        Create a comprehensive test suite for evaluating all solutions.
        
        Returns:
            Tuple containing paths to the test script and test runner
        """
        # Collect all solution classes
        solution_classes = []
        
        for round_num in range(self.rounds + 1):
            for model_name, response in self.responses.get(round_num, {}).items():
                if not response.code:
                    continue
                    
                # Clean model name for use as a class name
                clean_model_name = re.sub(r'[^a-zA-Z0-9]', '_', model_name)
                class_name = f"{clean_model_name.title()}Round{round_num}Solution"
                
                # With the new LLM-powered extract_code, we should already have a properly formatted class
                # Just add the class directly to our solution_classes list
                solution_classes.append(response.code)
        
        # Create the test script
        test_script = f"""#!/usr/bin/env python3
\"\"\"
LLM Tournament Test Suite

This script tests solutions from all rounds and models on a given input file,
and collects metrics on the results.
\"\"\"

import os
import time
import json
import inspect
import argparse
from pathlib import Path
from typing import Dict, List, Tuple, Any

{os.linesep.join(solution_classes)}

def main():
    \"\"\"Test all solutions and collect metrics\"\"\"
    parser = argparse.ArgumentParser(description="Test LLM tournament solutions")
    parser.add_argument("--input", type=str, required=True, help="Input file to test on")
    parser.add_argument("--output-dir", type=str, default="output_results_for_each_round_and_model",
                    help="Directory for results")
    args = parser.parse_args()
    
    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Read the input file
    with open(args.input, "r", encoding="utf-8") as f:
        input_text = f.read()
    
    def count_lines(text: str) -> int:
        \"\"\"Count the number of lines in a text\"\"\"
        return len(text.splitlines())

    def count_chars(text: str) -> int:
        \"\"\"Count the number of characters in a text\"\"\"
        return len(text)        
    
    # Get input file metrics
    input_lines = count_lines(input_text)
    input_chars = count_chars(input_text)
    print(f"Input file: {{args.input}}")
    print(f"Input lines: {{input_lines}}")
    print(f"Input chars: {{input_chars}}")
    
    # List of all solution classes
    solution_classes = [
        {", ".join([s.split("class ")[1].split("(")[0].split(":")[0].strip() for s in solution_classes if "class " in s])}
    ]
    
    # Test each solution
    metrics = []
    
    for solution_class in solution_classes:
        class_name = solution_class.__name__
        print(f"\\nTesting {{class_name}}...")

        # Extract model name and round number
        parts = class_name.split("Round")
        model_name = parts[0].replace("_", "-").lower()
        round_num = parts[1].split("Solution")[0]
        
        try:
            # Apply the solution
            start_time = time.time()
            result = solution_class.solve(input_text)
            execution_time = time.time() - start_time
            
            # Save the result
            output_filename = f"sample_file_output__{{model_name}}_round_{{round_num}}.md"
            output_path = output_dir / output_filename
            
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(result)
                
            # Calculate metrics
            output_lines = count_lines(result)
            output_chars = count_chars(result)
            output_size_kb = len(result) / 1024
            
            # Store metrics
            solution_metrics = {{
                "model": model_name,
                "round": round_num,
                "execution_time": round(execution_time, 2),
                "output_lines": output_lines,
                "output_chars": output_chars,
                "output_size_kb": round(output_size_kb, 2),
                "lines_ratio": round(output_lines / input_lines, 2),
                "chars_ratio": round(output_chars / input_chars, 2)
            }}
            
            metrics.append(solution_metrics)
            
            # Print metrics
            print(f"  Execution time: {{solution_metrics['execution_time']}}s")
            print(f"  Output lines: {{output_lines}}")
            print(f"  Output size: {{solution_metrics['output_size_kb']}} KB")
            print(f"  Output saved to: {{output_path}}")
            
        except Exception as e:
            print(f"  Error testing {{class_name}}: {{str(e)}}")

    # Save metrics
    metrics_path = output_dir.parent / "metrics" / "test_metrics.json"
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
    
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
        
    print(f"\\nMetrics saved to: {{metrics_path}}")
    
if __name__ == "__main__":
    main()
"""
        
        # Create a test runner script (unchanged)
        test_runner = f"""#!/usr/bin/env python3
\"\"\"
LLM Tournament Test Runner

This script runs the test suite on the provided test file.
\"\"\"

import os
import subprocess
import argparse
from pathlib import Path

def main():
    \"\"\"Run the test suite\"\"\"
    parser = argparse.ArgumentParser(description="Run LLM tournament tests")
    parser.add_argument("--test-file", type=str, required=True, help="File to test on")
    args = parser.parse_args()
    
    # Path to the test script
    test_script = Path(__file__).parent / "test_all_solutions.py"
    
    # Run the test script
    cmd = [
        "python",
        str(test_script),
        "--input", args.test_file,
        "--output-dir", "output_results_for_each_round_and_model"
    ]
    
    print(f"Running: {{' '.join(cmd)}}")
    subprocess.run(cmd, check=True)
    
if __name__ == "__main__":
    main()
"""
        
        # Save the test script
        test_script_path = self.output_dir / "test_all_solutions.py"
        with open(test_script_path, "w", encoding="utf-8") as f:
            f.write(test_script)
        
        # Save the test runner
        test_runner_path = self.output_dir / "run_tests.py"
        with open(test_runner_path, "w", encoding="utf-8") as f:
            f.write(test_runner)
        
        # Make the scripts executable
        os.chmod(test_script_path, 0o755)
        os.chmod(test_runner_path, 0o755)
        
        logger.info(f"Created test suite: {test_script_path}")
        return test_script_path, test_runner_path

    def create_results_analyzer(self) -> Path:
        """
        Create a script to analyze and visualize the tournament results.
        
        Returns:
            Path to the analyzer script
        """
        analyzer_script = """#!/usr/bin/env python3
\"\"\"
LLM Tournament Results Analyzer

This script analyzes and visualizes the results of the LLM tournament.
It creates plots and tables comparing the performance of different models
across rounds.
\"\"\"

import os
import json
import argparse
from pathlib import Path
from typing import Dict, List, Any

# Optional import for visualization
try:
    import matplotlib.pyplot as plt
    import numpy as np
    HAS_MATPLOTLIB = True
except ImportError:
    HAS_MATPLOTLIB = False

def load_metrics(metrics_dir: str) -> Dict[str, Any]:
    \"\"\"Load metrics from JSON files\"\"\"
    metrics_dir = Path(metrics_dir)
    
    # Load tournament metrics
    tournament_metrics_path = metrics_dir / "tournament_metrics.json"
    tournament_metrics = {}
    
    if tournament_metrics_path.exists():
        with open(tournament_metrics_path, "r", encoding="utf-8") as f:
            tournament_metrics = json.load(f)
    
    # Load test metrics
    test_metrics_path = metrics_dir / "test_metrics.json"
    test_metrics = []
    
    if test_metrics_path.exists():
        with open(test_metrics_path, "r", encoding="utf-8") as f:
            test_metrics = json.load(f)
    
    return {
        "tournament": tournament_metrics,
        "test": test_metrics
    }
    
def generate_markdown_report(metrics: Dict[str, Any], output_file: str) -> None:
    \"\"\"Generate a markdown report from the metrics\"\"\"
    tournament_metrics = metrics.get("tournament", {})
    test_metrics = metrics.get("test", [])
    
    # Start building the report
    report = "# LLM Tournament Results\\n\\n"
    report += "## Overview\\n\\n"
    report += "This report summarizes the results of the LLM tournament.\\n\\n"
    
    # Add test metrics table if available
    if test_metrics:
        report += "## Test Results\\n\\n"
        report += "| Model | Round | Execution Time (s) | Output Lines | Output Size (KB) |\\n"
        report += "|-------|-------|-------------------|--------------|------------------|\\n"
        
        # Sort by round, then model
        sorted_metrics = sorted(test_metrics, key=lambda x: (int(x.get("round", 0)), x.get("model", "")))
        
        for metric in sorted_metrics:
            model = metric.get("model", "Unknown")
            round_num = metric.get("round", "Unknown")
            time = metric.get("execution_time", "N/A")
            lines = metric.get("output_lines", "N/A")
            size = metric.get("output_size_kb", "N/A")
            
            report += f"| {model} | {round_num} | {time} | {lines} | {size} |\\n"
            
    # Add round metrics if available
    rounds_data = tournament_metrics.get("rounds", {})
    if rounds_data:
        report += "\\n## Code Metrics by Round\\n\\n"
        report += "| Round | Model | Code Size (KB) | Code Lines |\\n"
        report += "|-------|-------|---------------|------------|\\n"
        
        for round_num, round_data in sorted(rounds_data.items(), key=lambda x: int(x[0])):
            models_data = round_data.get("models", {})
            
            for model, model_data in sorted(models_data.items()):
                code_size = model_data.get("code_size_kb", "N/A")
                code_lines = model_data.get("code_lines", "N/A")
                
                report += f"| {round_num} | {model} | {code_size} | {code_lines} |\\n"
    
    # Save the report
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(report)
    
    print(f"Report saved to: {output_file}")

def create_visualizations(metrics: Dict[str, Any], output_dir: str) -> None:
    \"\"\"Create visualizations of the metrics\"\"\"
    if not HAS_MATPLOTLIB:
        print("Matplotlib not available. Skipping visualizations.")
        return
    
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    
    tournament_metrics = metrics.get("tournament", {})
    test_metrics = metrics.get("test", [])
    
    # Extract data for plotting
    rounds_data = tournament_metrics.get("rounds", {})
    if not rounds_data:
        return
    
    # Create a plot of code size by round and model
    plt.figure(figsize=(10, 6))
    
    # Extract data
    models = set()
    round_nums = []
    data_by_model = {}
    
    for round_num, round_data in sorted(rounds_data.items(), key=lambda x: int(x[0])):
        round_nums.append(int(round_num))
        models_data = round_data.get("models", {})
        
        for model, model_data in models_data.items():
            models.add(model)
            if model not in data_by_model:
                data_by_model[model] = []
            
            code_size = model_data.get("code_size_kb", 0)
            data_by_model[model].append(code_size)
    
    # Plot code size by round for each model
    for model, sizes in data_by_model.items():
        # Ensure all models have data for all rounds
        while len(sizes) < len(round_nums):
            sizes.append(None)
        
        plt.plot(round_nums, sizes, marker='o', label=model)
    
    plt.xlabel('Round')
    plt.ylabel('Code Size (KB)')
    plt.title('Code Size by Round and Model')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    
    # Save the plot
    plt.savefig(output_dir / "code_size_by_round.png")
    
    # Create a plot of execution time by round and model
    if test_metrics:
        plt.figure(figsize=(10, 6))
        
        # Extract data
        data_by_model = {}
        
        for metric in test_metrics:
            model = metric.get("model", "Unknown")
            round_num = int(metric.get("round", 0))
            time = metric.get("execution_time", 0)
            
            if model not in data_by_model:
                data_by_model[model] = []
            
            # Ensure the list is long enough
            while len(data_by_model[model]) <= round_num:
                data_by_model[model].append(None)
            
            data_by_model[model][round_num] = time
        
        # Plot execution time by round for each model
        for model, times in data_by_model.items():
            rounds = list(range(len(times)))
            plt.plot(rounds, times, marker='o', label=model)
        
        plt.xlabel('Round')
        plt.ylabel('Execution Time (s)')
        plt.title('Execution Time by Round and Model')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        
        # Save the plot
        plt.savefig(output_dir / "execution_time_by_round.png")
    
    print(f"Visualizations saved to: {{output_dir}}")

def main():
    \"\"\"Main function\"\"\"
    parser = argparse.ArgumentParser(description="Analyze LLM tournament results")
    parser.add_argument("--metrics-dir", type=str, default="metrics",
                     help="Directory containing metrics files")
    parser.add_argument("--output-dir", type=str, default="analysis",
                     help="Directory for output files")
    args = parser.parse_args()
    
    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Load metrics
    metrics = load_metrics(args.metrics_dir)
    
    # Generate markdown report
    report_path = output_dir / "tournament_results_report.md"
    generate_markdown_report(metrics, report_path)
    
    # Create visualizations
    create_visualizations(metrics, output_dir)

if __name__ == "__main__":
    main()
"""
        
        # Save the analyzer script
        analyzer_path = self.output_dir / "analyze_results.py"
        with open(analyzer_path, "w", encoding="utf-8") as f:
            f.write(analyzer_script)
        
        # Make the script executable
        os.chmod(analyzer_path, 0o755)
        
        logger.info(f"Created results analyzer: {analyzer_path}")
        return analyzer_path

    def run_tournament(self) -> Dict[str, List[ModelResponse]]:
        """
        Drive the whole tournament: round 0, every refinement round, then the
        final artifacts (metrics file, metrics report, test suite, analyzer).

        Returns:
            Dictionary mapping model names to lists of their responses for each round
        """
        # Reset the metrics bookkeeping for this run
        self.metrics["start_time"] = datetime.now().isoformat()
        self.metrics["models"] = list(self.models.keys())
        self.metrics["rounds"] = {}

        # Round 0 produces the initial solutions
        logger.info("Starting tournament with round 0 (initial solutions)")
        self.run_round(0)

        # Refinement rounds, pausing after every round except the last
        round_num = 1
        while round_num <= self.rounds:
            self.run_round(round_num)

            is_last_round = round_num >= self.rounds
            if not is_last_round:
                # Short pause between rounds to avoid rate limiting
                logger.info(f"Completed round {round_num}. Waiting before starting next round...")
                time.sleep(3)

            round_num += 1

        # Stamp the end of the run
        self.metrics["end_time"] = datetime.now().isoformat()

        logger.info("Tournament completed. Generating final artifacts...")

        # Final artifacts, in order: metrics file, metrics report,
        # test suite, results analyzer
        self.save_metrics()
        self.generate_metrics_report()
        self.create_test_suite()
        self.create_results_analyzer()

        # Group the stored responses per model, in round order
        results = {}
        for model_name in self.models.keys():
            results[model_name] = []

        for round_index in range(self.rounds + 1):
            stored = self.responses.get(round_index, {})
            for model_name, response in stored.items():
                results[model_name].append(response)

        logger.info(f"Tournament completed successfully with {self.rounds + 1} rounds")

        return results

    def run_tests(self, test_file: Optional[str] = None) -> None:
        """
        Run tests on all generated solutions.

        Invokes the generated run_tests.py in a subprocess, creating the test
        suite first if the runner does not exist yet. Missing test files and
        subprocess failures are logged rather than raised.

        Args:
            test_file: Path to the file to test on (defaults to self.test_file)
        """
        # Imported locally: the module's top-level imports do not include sys
        import sys

        if not test_file and not self.test_file:
            logger.warning("No test file provided. Skipping tests.")
            return

        test_file = test_file or self.test_file

        # Ensure the test file exists
        if not os.path.exists(test_file):
            logger.error(f"Test file not found: {test_file}")
            return

        # Get the test runner script
        test_runner = self.output_dir / "run_tests.py"

        if not test_runner.exists():
            logger.warning("Test runner not found. Creating test suite...")
            self.create_test_suite()

        # Run the tests
        logger.info(f"Running tests on {test_file}...")

        try:
            # Use the interpreter running this process instead of whatever
            # "python" resolves to on PATH (it may be absent or lack the
            # project's dependencies).
            cmd = [sys.executable or "python", str(test_runner), "--test-file", test_file]
            subprocess.run(cmd, check=True)
            logger.info("Tests completed successfully")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where the interpreter cannot be launched
            logger.error(f"Error running tests: {str(e)}")
    
    def analyze_results(self) -> None:
        """
        Run the results analyzer to generate reports and visualizations.

        Invokes the generated analyze_results.py in a subprocess, creating it
        first if it does not exist. The analyzer reads from
        <output_dir>/metrics and writes to <output_dir>/analysis. Subprocess
        failures are logged rather than raised.
        """
        # Imported locally: the module's top-level imports do not include sys
        import sys

        # Get the analyzer script
        analyzer = self.output_dir / "analyze_results.py"

        if not analyzer.exists():
            logger.warning("Results analyzer not found. Creating analyzer...")
            self.create_results_analyzer()

        # Run the analyzer
        logger.info("Analyzing tournament results...")

        try:
            # Use the interpreter running this process instead of whatever
            # "python" resolves to on PATH.
            cmd = [
                sys.executable or "python",
                str(analyzer),
                "--metrics-dir", str(self.output_dir / "metrics"),
                "--output-dir", str(self.output_dir / "analysis")
            ]
            subprocess.run(cmd, check=True)
            logger.info("Analysis completed successfully")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where the interpreter cannot be launched
            logger.error(f"Error analyzing results: {str(e)}")


def main():
    """
    Main function to run the LLM tournament from the command line.

    Parses and validates the command-line arguments, reads the prompt file,
    prints a startup banner, then runs the tournament, the optional tests and
    the results analysis. Exits with status 1 if the run is interrupted or
    raises an exception.
    """
    # Imported locally so that main() also works when this module is imported
    # and main() is called directly; otherwise sys is only bound by the
    # `if __name__ == "__main__"` guard.
    import sys

    parser = argparse.ArgumentParser(
        description="Run a multi-round LLM coding tournament",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Required arguments
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="File containing the coding challenge prompt"
    )

    # Optional arguments
    parser.add_argument(
        "--rounds",
        type=int,
        default=5,
        help="Number of tournament rounds (not including round 0)"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="tournament_results",
        help="Directory for tournament results"
    )
    parser.add_argument(
        "--test-file",
        type=str,
        help="File to use for testing solutions"
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Temperature for LLM generation (0.0-1.0)"
    )
    parser.add_argument(
        "--concurrent-requests",
        type=int,
        default=4,
        help="Maximum number of concurrent API requests"
    )
    parser.add_argument(
        "--skip-tests",
        action="store_true",
        help="Skip running tests on the solutions"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    # Parse arguments
    args = parser.parse_args()

    # Validate arguments (parser.error prints usage and exits)
    if not os.path.exists(args.prompt):
        parser.error(f"Prompt file not found: {args.prompt}")

    if args.test_file and not os.path.exists(args.test_file):
        parser.error(f"Test file not found: {args.test_file}")

    if args.rounds < 1:
        parser.error("Number of rounds must be at least 1")

    if args.temperature < 0.0 or args.temperature > 1.0:
        parser.error("Temperature must be between 0.0 and 1.0")

    # Load the prompt from file
    with open(args.prompt, "r", encoding="utf-8") as f:
        prompt_text = f.read()

    # Display startup banner
    print(r"""

   __    __              _____                                                  _   
  / /   / /   /\/\      /__   \___  _   _ _ __ _ __   __ _ _ __ ___   ___ _ __ | |_ 
 / /   / /   /    \ _____ / /\/ _ \| | | | '__| '_ \ / _` | '_ ` _ \ / _ \ '_ \| __|
/ /___/ /___/ /\/\ \_____/ / | (_) | |_| | |  | | | | (_| | | | | | |  __/ | | | |_ 
\____/\____/\/    \/     \/   \___/ \__,_|_|  |_| |_|\__,_|_| |_| |_|\___|_| |_|\__|
                                                                                    
                                                                                 
""")
    print(f"Starting LLM Tournament with {len(MODELS)} models for {args.rounds} rounds")
    print(f"Output directory: {args.output_dir}")
    print(f"Models: {', '.join(MODELS.keys())}")
    print()

    # Initialize the tournament
    tournament = LLMTournament(
        prompt=prompt_text,
        rounds=args.rounds,
        output_dir=args.output_dir,
        temperature=args.temperature,
        concurrent_requests=args.concurrent_requests,
        test_file=args.test_file,
        verbose=args.verbose
    )

    # Run the tournament
    try:
        tournament.run_tournament()

        # Run tests if a test file is provided and tests are not skipped
        if args.test_file and not args.skip_tests:
            tournament.run_tests(args.test_file)

        # Analyze results
        tournament.analyze_results()

        # Print final message
        print("\nTournament completed successfully!")
        print(f"Results are available in: {args.output_dir}")

    except KeyboardInterrupt:
        print("\nTournament interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError running tournament: {str(e)}")
        traceback.print_exc()
        sys.exit(1)


# Script entry point: bind sys at module level (main() calls sys.exit() on
# failure), then run the command-line interface.
if __name__ == "__main__":
    import sys
    main()