# pylint: disable=redefined-outer-name,too-many-arguments,too-many-positional-arguments
"""End-to-end integration tests for the LightSpeed Evaluation Framework.

These tests run the complete evaluation pipeline against real services:
- Real Lightspeed-stack API on localhost:8080
- Real OpenAI API (requires OPENAI_API_KEY)
- Real evaluation metrics (Ragas, DeepEval, etc.)

Prerequisites:
    - Lightspeed-stack API running on localhost:8080
    - OPENAI_API_KEY environment variable set
    - Network connectivity for API calls

Run with: pytest tests/integration/ -v -m integration
"""

import os
from pathlib import Path

import httpx
import pytest

from lightspeed_evaluation import ConfigLoader, evaluate
from lightspeed_evaluation.core.models import EvaluationResult
from lightspeed_evaluation.core.system import DataValidator

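# The tests below rely on test_evaluation_data.yaml describing one conversation
# group ("conv_group_1") with a single turn ("turn_id1") that asks "What is the
# capital of France?" and is scored with ragas:response_relevancy at a 0.9
# threshold. A hypothetical sketch of that file (field names inferred from the
# assertions below, not from the framework's actual schema):
#
#   - conversation_group_id: conv_group_1
#     turns:
#       - turn_id: turn_id1
#         query: "What is the capital of France?"
#         turn_metrics:
#           - name: "ragas:response_relevancy"
#             threshold: 0.9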

def check_api_available() -> bool:
    """Check if the Lightspeed-stack API is available on localhost:8080."""
    try:
        # Probe /v1/models rather than /health, which may not exist
        response = httpx.get("http://localhost:8080/v1/models", timeout=2.0)
        return response.status_code == 200
    except (httpx.ConnectError, httpx.TimeoutException):
        return False


def check_openai_key_available() -> bool:
    """Check if OPENAI_API_KEY is set in the environment."""
    return bool(os.getenv("OPENAI_API_KEY"))

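# These helpers can also gate individual tests via skip markers instead of the
# hard assertions in TestIntegrationConfiguration below. A minimal sketch:
#
#   @pytest.mark.skipif(not check_api_available(), reason="API not reachable")
#   def test_needs_api() -> None: ...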

# Mark ALL tests in this file as integration tests.
# They do NOT run by default; run them explicitly with: pytest -m integration
pytestmark = pytest.mark.integration
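
# Keeping these tests out of the default run relies on pytest configuration
# elsewhere in the repo. A sketch, assuming pyproject.toml-based settings:
#
#   [tool.pytest.ini_options]
#   markers = ["integration: end-to-end tests against real services"]
#   addopts = "-m 'not integration'"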


@pytest.fixture
def integration_test_dir() -> Path:
    """Get the integration test directory path."""
    return Path(__file__).parent


@pytest.fixture
def eval_data_path(integration_test_dir: Path) -> Path:
    """Get the path to the test evaluation data file."""
    return integration_test_dir / "test_evaluation_data.yaml"


@pytest.fixture
def query_config_path(integration_test_dir: Path) -> Path:
    """Get the path to the query endpoint system config file."""
    return integration_test_dir / "system-config-query.yaml"


@pytest.fixture
def streaming_config_path(integration_test_dir: Path) -> Path:
    """Get the path to the streaming endpoint system config file."""
    return integration_test_dir / "system-config-streaming.yaml"


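# Both test methods below parametrize over fixture *names* and resolve them at
# runtime with request.getfixturevalue(), since fixtures cannot be passed
# directly as parametrize values.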
class TestFullEvaluation:
    """End-to-end tests for full evaluation with both query and streaming endpoints."""

    @pytest.mark.parametrize(
        "config_fixture,endpoint_type",
        [
            ("query_config_path", "query"),
            ("streaming_config_path", "streaming"),
        ],
    )
    def test_full_evaluation_endpoint(  # pylint: disable=too-many-locals
        self,
        config_fixture: str,
        endpoint_type: str,
        eval_data_path: Path,
        request: pytest.FixtureRequest,
        tmp_path: Path,
    ) -> None:
        """Test complete evaluation with both query and streaming endpoints.

        This test verifies:
        - System config loads correctly
        - Evaluation data loads correctly
        - API calls are made to localhost:8080
        - The LLM judge evaluates responses
        - The pipeline executes without errors
        - The result is PASS (evaluation succeeds)

        Args:
            config_fixture: Name of the fixture providing the config path
            endpoint_type: Type of endpoint ('query' or 'streaming')
            eval_data_path: Path to the evaluation data YAML
            request: Pytest fixture request object
            tmp_path: Temporary directory for output
        """
        # Get the actual config path from the fixture
        config_path = request.getfixturevalue(config_fixture)

        # Load configuration
        loader = ConfigLoader()
        system_config = loader.load_system_config(str(config_path))

        # Override output directory to use temporary path
        system_config.output.output_dir = str(tmp_path / "eval_output")

        # Verify endpoint type matches expectation
        assert (
            system_config.api.endpoint_type == endpoint_type
        ), f"Config should use {endpoint_type} endpoint"

        # Load evaluation data
        validator = DataValidator(
            api_enabled=system_config.api.enabled,
            fail_on_invalid_data=system_config.core.fail_on_invalid_data,
        )
        evaluation_data = validator.load_evaluation_data(str(eval_data_path))

        # Verify evaluation data loaded
        assert len(evaluation_data) > 0, "Evaluation data should not be empty"
        assert evaluation_data[0].conversation_group_id == "conv_group_1"
        assert len(evaluation_data[0].turns) > 0

        # Run evaluation (makes real API calls)
        results = evaluate(system_config, evaluation_data)

        # Verify results
        assert isinstance(results, list), "Results should be a list"
        assert len(results) > 0, "Should have at least one result"

        # Verify all results are EvaluationResult instances
        for result in results:
            assert isinstance(result, EvaluationResult)
            assert result.conversation_group_id == "conv_group_1"
            assert result.turn_id == "turn_id1"
            assert result.metric_identifier is not None

        # Verify we have the expected metric
        metric_identifiers = [r.metric_identifier for r in results]
        assert "ragas:response_relevancy" in metric_identifiers

        # Find the response_relevancy result
        relevancy_result = next(
            r for r in results if r.metric_identifier == "ragas:response_relevancy"
        )

        # Verify the evaluation PASSED
        assert relevancy_result.result == "PASS", (
            f"Evaluation should PASS but got {relevancy_result.result}. "
            f"Reason: {relevancy_result.reason}"
        )

        # Verify threshold is correct (from test data)
        assert relevancy_result.threshold == 0.9, "Threshold should match test data"

        # Verify score is above threshold
        assert (
            relevancy_result.score >= relevancy_result.threshold  # type: ignore
        ), f"Score {relevancy_result.score} should be >= threshold {relevancy_result.threshold}"

        # Verify we got a non-empty response from the API
        assert relevancy_result.response is not None, "Should have response from API"
        assert relevancy_result.response.strip(), "API response should not be empty"
        assert relevancy_result.query == "What is the capital of France?"

        # Verify API token usage is tracked
        assert (
            relevancy_result.api_input_tokens > 0
            or relevancy_result.api_output_tokens > 0
        ), "API token usage should be tracked"
    @pytest.mark.parametrize(
        "config_fixture,endpoint_type",
        [
            ("query_config_path", "query"),
            ("streaming_config_path", "streaming"),
        ],
    )
    def test_api_response_enrichment(
        self,
        config_fixture: str,
        endpoint_type: str,
        eval_data_path: Path,
        request: pytest.FixtureRequest,
        tmp_path: Path,
    ) -> None:
        """Test that API responses properly enrich evaluation data.

        Verifies:
        - API returns responses for queries
        - Token usage is tracked
        - Contexts are retrieved (if available)
        - Response is non-empty

        Args:
            config_fixture: Name of the fixture providing the config path
            endpoint_type: Type of endpoint ('query' or 'streaming')
            eval_data_path: Path to the evaluation data YAML
            request: Pytest fixture request object
            tmp_path: Temporary directory for output
        """
        # Get the actual config path from the fixture
        config_path = request.getfixturevalue(config_fixture)

        loader = ConfigLoader()
        system_config = loader.load_system_config(str(config_path))
        system_config.output.output_dir = str(tmp_path / "eval_output")

        # Verify endpoint type matches expectation
        assert (
            system_config.api.endpoint_type == endpoint_type
        ), f"Config should use {endpoint_type} endpoint"

        validator = DataValidator(
            api_enabled=system_config.api.enabled,
            fail_on_invalid_data=system_config.core.fail_on_invalid_data,
        )
        evaluation_data = validator.load_evaluation_data(str(eval_data_path))

        # Run evaluation
        results = evaluate(system_config, evaluation_data)

        # Collect results that carry API data (non-empty response plus
        # tracked token usage)
        results_with_api_data = [
            r
            for r in results
            if r.response.strip()
            and (r.api_input_tokens > 0 or r.api_output_tokens > 0)
        ]
        assert (
            results_with_api_data
        ), "At least one result should have API response data"
        result_with_response = results_with_api_data[0]

        # Verify response is not empty
        assert (
            len(result_with_response.response) > 0
        ), "API response should not be empty"

        # Verify token usage
        assert (
            result_with_response.api_input_tokens > 0
        ), "Input tokens should be tracked"
        assert (
            result_with_response.api_output_tokens > 0
        ), "Output tokens should be tracked"
| 264 | + |
| 265 | +class TestIntegrationConfiguration: |
| 266 | + """Tests for integration test configuration and prerequisites.""" |
| 267 | + |
| 268 | + def test_api_connectivity(self) -> None: |
| 269 | + """Verify that the Lightspeed-stack API is accessible.""" |
| 270 | + assert ( |
| 271 | + check_api_available() |
| 272 | + ), "Lightspeed-stack API should be available on localhost:8080" |
| 273 | + |
| 274 | + def test_openai_key_configured(self) -> None: |
| 275 | + """Verify that OPENAI_API_KEY is configured.""" |
| 276 | + assert ( |
| 277 | + check_openai_key_available() |
| 278 | + ), "OPENAI_API_KEY environment variable should be set" |
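

# To check prerequisites alone before a full run, something like:
#   pytest tests/integration/ -v -m integration -k TestIntegrationConfiguration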