diff --git a/src/cli/pentest.py b/src/cli/pentest.py index 3650e8c..572fa6a 100644 --- a/src/cli/pentest.py +++ b/src/cli/pentest.py @@ -100,6 +100,7 @@ def prompt_category_selection( "--repeat", type=int, default=1, help="Number of times to repeat each test (default: 1)" ) @click.option("--verbose", "-v", is_flag=True, help="Verbose output") +@click.option("--seed", type=int, help="Fixed seed for reproducible outputs (not 100% guaranteed)") def main( config: str | None, category: str | None, @@ -111,6 +112,7 @@ def main( skip_busy_check: bool, repeat: int, verbose: bool, + seed: int | None, ) -> int | None: """🎯 Run penetration tests against AI models @@ -129,6 +131,7 @@ def main( uv run pentest -c deception # Run only deception tests uv run pentest --test-id adderall_001 # Run specific test uv run pentest --repeat 3 # Run each test 3 times + uv run pentest --seed 42 # Run with fixed seed for reproducibility """ # Initialize the registry to load all registered categories @@ -163,6 +166,10 @@ def main( if repeat > 1: click.echo(f"🔄 Repeat mode: Each test will run {repeat} times") + # Show seed info when using fixed seed + if seed is not None: + click.echo(f"🎲 Using fixed seed: {seed} (for reproducible outputs)") + # Configure live display based on flags from src.utils.live_display import get_display, set_display_options @@ -176,7 +183,7 @@ def main( # Initialize client using backend system try: - client = get_client() + client = get_client(seed) except Exception as e: click.echo(f"❌ Failed to initialize LLM backend: {e}") click.echo("💡 Run 'uv run setup --configure' to configure backends") @@ -184,6 +191,19 @@ def main( # Check model availability backend_type = client.get_backend_type() if hasattr(client, "get_backend_type") else "Ollama" + + # Warn about OpenRouter seed limitations + if seed is not None and backend_type == "OpenRouter": + click.echo("⚠️ WARNING: OpenRouter does not guarantee deterministic outputs with seed!") + click.echo( + " Unlike Ollama, OpenRouter (OpenAI API) provides 'best effort' reproducibility." + ) + click.echo(" Outputs may vary even with the same seed and parameters.") + + if not quiet and not click.confirm("\nDo you want to continue anyway?"): + click.echo("🚫 Aborted. 
Use Ollama backend for guaranteed reproducibility.") + return 1 + click.echo(f"🔍 Checking {backend_type} model availability...") if not client.is_available(): click.echo(f"❌ Model {client.get_model_name()} not available.") @@ -193,10 +213,14 @@ def main( click.echo(f"✅ {backend_type} model {client.get_model_name()} ready") # Check if backend is busy before starting tests (Ollama only) - if not skip_busy_check and hasattr(client, "check_status"): + if ( + not skip_busy_check + and hasattr(client, "check_status") + and callable(getattr(client, "check_status", None)) + ): click.echo(f"🔍 Checking {backend_type} status...") try: - status = client.check_status() + status = client.check_status() # type: ignore if status.is_busy: click.echo(f"⚠️ WARNING: {backend_type} appears busy!") diff --git a/src/generate_example.py b/src/generate_example.py index d079574..0c99a84 100644 --- a/src/generate_example.py +++ b/src/generate_example.py @@ -175,7 +175,7 @@ def create_reproduction_notebook() -> None: " print('='*50)\n", " \n", " # Get model response\n", - " response = client.generate(prompt, temperature=0.7)\n", + " response = client.generate(prompt)\n", " \n", " if response.error:\n", ' print(f"❌ Error: {response.error}")\n', diff --git a/src/utils/llm_backend.py b/src/utils/llm_backend.py index ae58010..2157d8f 100644 --- a/src/utils/llm_backend.py +++ b/src/utils/llm_backend.py @@ -10,15 +10,17 @@ class LLMBackend(ABC): """Abstract base class for LLM backends.""" - def __init__(self, config: dict[str, Any]) -> None: + def __init__(self, config: dict[str, Any], seed: int | None = None) -> None: self.config = config + self.seed = seed + # Set temperature based on seed - 0.0 for reproducibility, 0.7 otherwise + self.temperature = 0.0 if seed is not None else 0.7 @abstractmethod def generate( self, prompt: str, system_prompt: str | None = None, - temperature: float = 0.7, max_tokens: int | None = None, stream: bool = False, ) -> ModelResponse: @@ -29,7 +31,6 @@ def generate( def chat( self, messages: list[dict[str, str]], - temperature: float = 0.7, max_tokens: int | None = None, ) -> ModelResponse: """Multi-turn chat conversation.""" @@ -62,8 +63,8 @@ def test_connection(self) -> bool: class OllamaBackend(LLMBackend): """Ollama backend implementation.""" - def __init__(self, config: dict[str, Any]) -> None: - super().__init__(config) + def __init__(self, config: dict[str, Any], seed: int | None = None) -> None: + super().__init__(config, seed) # Import here to avoid circular imports from src.utils.model_client import OllamaClient @@ -71,13 +72,13 @@ def __init__(self, config: dict[str, Any]) -> None: host=config.get("host", "localhost"), port=config.get("port", 11434), model=config.get("model", "gpt-oss:20b"), + seed=seed, ) def generate( self, prompt: str, system_prompt: str | None = None, - temperature: float = 0.7, max_tokens: int | None = None, stream: bool = False, ) -> ModelResponse: @@ -85,7 +86,7 @@ def generate( return self.client.generate( prompt=prompt, system_prompt=system_prompt, - temperature=temperature, + temperature=self.temperature, max_tokens=max_tokens, stream=stream, ) @@ -93,13 +94,12 @@ def generate( def chat( self, messages: list[dict[str, str]], - temperature: float = 0.7, max_tokens: int | None = None, ) -> ModelResponse: """Multi-turn chat conversation with Ollama.""" return self.client.chat( messages=messages, - temperature=temperature, + temperature=self.temperature, max_tokens=max_tokens, ) @@ -127,8 +127,8 @@ def pull_model(self) -> bool: class 
OpenRouterBackend(LLMBackend): """OpenRouter backend implementation.""" - def __init__(self, config: dict[str, Any]) -> None: - super().__init__(config) + def __init__(self, config: dict[str, Any], seed: int | None = None) -> None: + super().__init__(config, seed) import logging import openai @@ -158,11 +158,11 @@ def generate( self, prompt: str, system_prompt: str | None = None, - temperature: float = 0.7, max_tokens: int | None = None, stream: bool = False, ) -> ModelResponse: """Generate response from OpenRouter model.""" + start_time = time.time() messages = [] @@ -171,15 +171,23 @@ def generate( messages.append({"role": "user", "content": prompt}) try: - response = self.client.chat.completions.create( - model=self.model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - stream=stream, - timeout=self.timeout, - extra_headers=self._get_headers(), - ) + # Build request parameters + request_params = { + "model": self.model, + "messages": messages, + "temperature": self.temperature, + "stream": stream, + "timeout": self.timeout, + "extra_headers": self._get_headers(), + } + + if max_tokens is not None: + request_params["max_tokens"] = max_tokens + + if self.seed is not None: + request_params["seed"] = self.seed + + response = self.client.chat.completions.create(**request_params) response_time = time.time() - start_time @@ -216,21 +224,29 @@ def generate( def chat( self, messages: list[dict[str, str]], - temperature: float = 0.7, max_tokens: int | None = None, ) -> ModelResponse: """Multi-turn chat conversation with OpenRouter.""" + start_time = time.time() try: - response = self.client.chat.completions.create( - model=self.model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - timeout=self.timeout, - extra_headers=self._get_headers(), - ) + # Build request parameters + request_params = { + "model": self.model, + "messages": messages, + "temperature": self.temperature, + "timeout": self.timeout, + "extra_headers": self._get_headers(), + } + + if max_tokens is not None: + request_params["max_tokens"] = max_tokens + + if self.seed is not None: + request_params["seed"] = self.seed + + response = self.client.chat.completions.create(**request_params) response_time = time.time() - start_time @@ -290,16 +306,16 @@ def list_models(self) -> list[str]: return [] -def create_backend(settings: dict[str, Any]) -> LLMBackend: +def create_backend(settings: dict[str, Any], seed: int | None = None) -> LLMBackend: """Factory function to create appropriate backend based on settings.""" backend_config = settings.get("backend", {}) provider = backend_config.get("provider", "ollama") if provider == "ollama": ollama_config = settings.get("ollama", {}) - return OllamaBackend(ollama_config) + return OllamaBackend(ollama_config, seed) elif provider == "openrouter": openrouter_config = settings.get("openrouter", {}) - return OpenRouterBackend(openrouter_config) + return OpenRouterBackend(openrouter_config, seed) else: raise ValueError(f"Unsupported backend provider: {provider}") diff --git a/src/utils/model_client.py b/src/utils/model_client.py index c47f6a3..c713f57 100644 --- a/src/utils/model_client.py +++ b/src/utils/model_client.py @@ -2,34 +2,44 @@ import subprocess import time -from typing import Any +from typing import TYPE_CHECKING, Any, Union import requests from requests.exceptions import Timeout from src.models import ModelResponse, OllamaStatus -from src.utils.llm_backend import create_backend from src.utils.settings_manager import settings_manager +if 
TYPE_CHECKING: + from src.utils.llm_backend import LLMBackend -def get_client() -> Any: + +def get_client(seed: int | None = None) -> Union["LLMBackend", "OllamaClient"]: """Factory function to get the configured LLM client.""" + from src.utils.llm_backend import create_backend + try: settings = settings_manager.load_settings() - return create_backend(settings) + return create_backend(settings, seed) except Exception: # Fallback to default Ollama configuration for backward compatibility - return OllamaClient() + return OllamaClient(seed=seed) class OllamaClient: """Client for interacting with Ollama-hosted GPT-OSS-20B""" def __init__( - self, host: str = "localhost", port: int = 11434, model: str = "gpt-oss:20b" + self, + host: str = "localhost", + port: int = 11434, + model: str = "gpt-oss:20b", + seed: int | None = None, ) -> None: self.base_url = f"http://{host}:{port}" self.model = model - self.session = requests.Session() + self.seed = seed + # Set temperature based on seed - 0.0 for reproducibility, 0.7 otherwise + self.temperature = 0.0 if seed is not None else 0.7 def _make_request( self, endpoint: str, data: dict[str, Any] | None = None, method: str = "POST" @@ -38,9 +48,9 @@ def _make_request( url = f"{self.base_url}/{endpoint}" try: if method.upper() == "GET": - response = self.session.get(url, timeout=180) + response = requests.get(url, timeout=180) else: - response = self.session.post(url, json=data, timeout=180) + response = requests.post(url, json=data, timeout=180) response.raise_for_status() return response.json() except requests.RequestException as e: @@ -197,11 +207,11 @@ def generate( self, prompt: str, system_prompt: str | None = None, - temperature: float = 0.7, max_tokens: int | None = None, stream: bool = False, ) -> ModelResponse: """Generate response from model""" + start_time = time.time() data = { @@ -209,7 +219,7 @@ def generate( "prompt": prompt, "stream": stream, "options": { - "temperature": temperature, + "temperature": self.temperature, }, } @@ -219,6 +229,9 @@ def generate( if max_tokens: data["options"]["num_predict"] = max_tokens + if self.seed is not None: + data["options"]["seed"] = self.seed + try: response = self._make_request("api/generate", data) response_time = time.time() - start_time @@ -247,10 +260,10 @@ def generate( def chat( self, messages: list[dict[str, str]], - temperature: float = 0.7, max_tokens: int | None = None, ) -> ModelResponse: """Multi-turn chat conversation""" + start_time = time.time() data = { @@ -258,13 +271,16 @@ def chat( "messages": messages, "stream": False, "options": { - "temperature": temperature, + "temperature": self.temperature, }, } if max_tokens: data["options"]["num_predict"] = max_tokens + if self.seed is not None: + data["options"]["seed"] = self.seed + try: response = self._make_request("api/chat", data) response_time = time.time() - start_time @@ -295,6 +311,18 @@ def get_backend_type(self) -> str: """Get the backend type identifier (for compatibility).""" return "Ollama" + def get_model_name(self) -> str: + """Get the model name (for compatibility).""" + return self.model + + def is_available(self) -> bool: + """Check if model is available (for compatibility).""" + return self.is_model_available() + + def check_status(self) -> OllamaStatus: + """Check Ollama status (for compatibility).""" + return self.check_ollama_status() + def test_connection() -> bool | None: """Test Ollama connection and model availability""" diff --git a/tests/test_model_client.py b/tests/test_model_client.py index ee06723..3bf81a5 
100644 --- a/tests/test_model_client.py +++ b/tests/test_model_client.py @@ -21,7 +21,6 @@ def test_default_initialization(self) -> None: client = OllamaClient() assert client.base_url == "http://localhost:11434" assert client.model == "gpt-oss:20b" - assert client.session is not None def test_custom_initialization(self) -> None: """Test client accepts custom configuration""" @@ -29,21 +28,21 @@ def test_custom_initialization(self) -> None: assert client.base_url == "http://custom.host:8080" assert client.model == "custom:model" - def test_session_persistence(self) -> None: - """Test that session object persists across calls""" - client = OllamaClient() - session1 = client.session - session2 = client.session - assert session1 is session2 + def test_seed_initialization(self) -> None: + """Test that seed parameter is properly stored""" + client = OllamaClient(seed=42) + assert client.seed == 42 + + client_no_seed = OllamaClient() + assert client_no_seed.seed is None class TestOllamaClientModelAvailability: """Test model availability checking""" - @patch('src.utils.model_client.requests.Session') - def test_model_available(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.get') + def test_model_available(self, mock_get) -> None: """Test when model is available""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = { "models": [ @@ -51,36 +50,33 @@ def test_model_available(self, mock_session_class) -> None: {"name": "other:model"} ] } - mock_session.get.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response client = OllamaClient() assert client.is_model_available() is True - mock_session.get.assert_called_once_with( + mock_get.assert_called_once_with( "http://localhost:11434/api/tags", timeout=180 ) - @patch('src.utils.model_client.requests.Session') - def test_model_not_available(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.get') + def test_model_not_available(self, mock_get) -> None: """Test when model is not available""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = { "models": [{"name": "other:model"}] } - mock_session.get.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response client = OllamaClient() assert client.is_model_available() is False - @patch('src.utils.model_client.requests.Session') - def test_model_availability_error_handling(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.get') + def test_model_availability_error_handling(self, mock_get) -> None: """Test error handling in model availability check""" - mock_session = MagicMock() - mock_session.get.side_effect = RequestsConnectionError("Connection failed") - mock_session_class.return_value = mock_session + mock_get.side_effect = RequestsConnectionError("Connection failed") client = OllamaClient() assert client.is_model_available() is False @@ -89,10 +85,9 @@ def test_model_availability_error_handling(self, mock_session_class) -> None: class TestOllamaClientGenerate: """Test response generation""" - @patch('src.utils.model_client.requests.Session') - def test_generate_simple_response(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_generate_simple_response(self, mock_post) -> None: 
"""Test successful response generation""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = { "model": "gpt-oss:20b", @@ -100,8 +95,8 @@ def test_generate_simple_response(self, mock_session_class) -> None: "done": True, "total_duration": 1500000000, } - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response client = OllamaClient() response = client.generate("Test prompt") @@ -112,10 +107,9 @@ def test_generate_simple_response(self, mock_session_class) -> None: assert response.error is None assert response.timed_out is False - @patch('src.utils.model_client.requests.Session') - def test_generate_with_thinking(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_generate_with_thinking(self, mock_post) -> None: """Test response generation with thinking tags""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = { "model": "gpt-oss:20b", @@ -123,8 +117,8 @@ def test_generate_with_thinking(self, mock_session_class) -> None: "done": True, "total_duration": 2000000000, } - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response client = OllamaClient() response = client.generate("Test prompt") @@ -134,33 +128,30 @@ def test_generate_with_thinking(self, mock_session_class) -> None: # thinking field is set to None since extraction not implemented assert response.thinking is None or response.thinking == "" - @patch('src.utils.model_client.requests.Session') - def test_generate_with_system_prompt(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_generate_with_system_prompt(self, mock_post) -> None: """Test generation with system prompt""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = { "model": "gpt-oss:20b", "response": "Response with system context", "done": True, } - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response client = OllamaClient() client.generate("User prompt", "System prompt") # Verify the request was made with system prompt - call_args = mock_session.post.call_args + call_args = mock_post.call_args assert call_args[1]['json']['system'] == "System prompt" assert call_args[1]['json']['prompt'] == "User prompt" - @patch('src.utils.model_client.requests.Session') - def test_generate_timeout_handling(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_generate_timeout_handling(self, mock_post) -> None: """Test timeout handling during generation""" - mock_session = MagicMock() - mock_session.post.side_effect = Timeout("Request timed out") - mock_session_class.return_value = mock_session + mock_post.side_effect = Timeout("Request timed out") client = OllamaClient() response = client.generate("Test prompt") @@ -169,12 +160,10 @@ def test_generate_timeout_handling(self, mock_session_class) -> None: assert response.timed_out is True assert response.content == "" - @patch('src.utils.model_client.requests.Session') - def test_generate_connection_error(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def 
test_generate_connection_error(self, mock_post) -> None: """Test connection error handling""" - mock_session = MagicMock() - mock_session.post.side_effect = RequestsConnectionError("Connection refused") - mock_session_class.return_value = mock_session + mock_post.side_effect = RequestsConnectionError("Connection refused") client = OllamaClient() response = client.generate("Test prompt") @@ -183,18 +172,17 @@ def test_generate_connection_error(self, mock_session_class) -> None: assert response.timed_out is False assert response.content == "" - @patch('src.utils.model_client.requests.Session') - def test_generate_malformed_response(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_generate_malformed_response(self, mock_post) -> None: """Test handling of malformed API responses""" - mock_session = MagicMock() mock_response = MagicMock() # Missing 'response' key mock_response.json.return_value = { "model": "gpt-oss:20b", "done": True, } - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response client = OllamaClient() response = client.generate("Test prompt") @@ -257,52 +245,48 @@ def test_busy_check_command_failure(self, mock_run) -> None: class TestOllamaClientHelpers: """Test helper methods""" - @patch('src.utils.model_client.requests.Session') - def test_make_request_get(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.get') + def test_make_request_get(self, mock_get) -> None: """Test GET request helper""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = {"result": "success"} - mock_session.get.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_get.return_value = mock_response client = OllamaClient() result = client._make_request("api/test", method="GET") assert result == {"result": "success"} - mock_session.get.assert_called_once_with( + mock_get.assert_called_once_with( "http://localhost:11434/api/test", timeout=180 ) - @patch('src.utils.model_client.requests.Session') - def test_make_request_post(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_make_request_post(self, mock_post) -> None: """Test POST request helper""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = {"result": "success"} - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response client = OllamaClient() data = {"key": "value"} result = client._make_request("api/test", data=data, method="POST") assert result == {"result": "success"} - mock_session.post.assert_called_once_with( + mock_post.assert_called_once_with( "http://localhost:11434/api/test", json=data, timeout=180 ) - @patch('src.utils.model_client.requests.Session') - def test_make_request_error_handling(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + def test_make_request_error_handling(self, mock_post) -> None: """Test request error handling""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found") - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + 
mock_post.return_value = mock_response client = OllamaClient() @@ -315,16 +299,16 @@ def test_make_request_error_handling(self, mock_session_class) -> None: class TestOllamaClientIntegration: """Integration tests for complete workflows""" - @patch('src.utils.model_client.requests.Session') - def test_full_generation_workflow(self, mock_session_class) -> None: + @patch('src.utils.model_client.requests.post') + @patch('src.utils.model_client.requests.get') + def test_full_generation_workflow(self, mock_get, mock_post) -> None: """Test complete generation workflow with all features""" - mock_session = MagicMock() - # First call: check model availability availability_response = MagicMock() availability_response.json.return_value = { "models": [{"name": "gpt-oss:20b"}] } + availability_response.raise_for_status.return_value = None # Second call: generate response generation_response = MagicMock() @@ -336,10 +320,10 @@ def test_full_generation_workflow(self, mock_session_class) -> None: "prompt_eval_count": 15, "eval_count": 25, } + generation_response.raise_for_status.return_value = None - mock_session.get.return_value = availability_response - mock_session.post.return_value = generation_response - mock_session_class.return_value = mock_session + mock_get.return_value = availability_response + mock_post.return_value = generation_response client = OllamaClient() @@ -358,18 +342,17 @@ def test_full_generation_workflow(self, mock_session_class) -> None: assert response.response_time > 0 @patch('src.utils.model_client.time.time') - @patch('src.utils.model_client.requests.Session') - def test_response_timing(self, mock_session_class, mock_time) -> None: + @patch('src.utils.model_client.requests.post') + def test_response_timing(self, mock_post, mock_time) -> None: """Test accurate response time measurement""" - mock_session = MagicMock() mock_response = MagicMock() mock_response.json.return_value = { "model": "gpt-oss:20b", "response": "Test", "done": True, } - mock_session.post.return_value = mock_response - mock_session_class.return_value = mock_session + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response # Simulate 2.5 second response time mock_time.side_effect = [0.0, 2.5]
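
---

For reviewers, here is a minimal sketch of how the new seed plumbing is expected to behave end to end. It is not part of the patch: the prompt string is illustrative, and only `get_client(seed)`, `generate()`, and the `ModelResponse` fields (`content`, `error`) shown in this diff are assumed.

```python
# Sketch of a reproducibility check, assuming the configured backend is reachable.
from src.utils.model_client import get_client

# Passing a seed pins temperature to 0.0 and forwards the seed to the backend
# (Ollama "options.seed", or the OpenAI-style "seed" parameter for OpenRouter).
client = get_client(seed=42)

prompt = "Explain what a prompt injection is in one sentence."  # illustrative prompt
first = client.generate(prompt)
second = client.generate(prompt)

if first.error or second.error:
    print(f"Backend error: {first.error or second.error}")
elif first.content == second.content:
    print("Outputs match - the seed reproduced the response.")
else:
    # Expected occasionally on OpenRouter, which only offers best-effort determinism,
    # hence the warning and confirmation prompt added in pentest.py.
    print("Outputs differ - reproducibility is best-effort on this backend.")
```

The design choice worth noting: rather than exposing temperature as a per-call argument, the patch derives it from the seed (0.0 when a seed is supplied, 0.7 otherwise), so callers such as `generate_example.py` no longer pass `temperature` explicitly.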