
Commit 21fff16

chore: refactor e2e tests
Parent: 5e8be40

7 files changed

Lines changed: 303 additions & 46 deletions

.github/workflows/e2e_tests.yaml

Lines changed: 3 additions & 43 deletions
@@ -8,19 +8,14 @@ jobs:
   e2e_tests:
     runs-on: ubuntu-latest

-    #name: "Lightspeed-stack setup"
-
     strategy:
-      # For local testing use matrix with just one variant, "act" doesn't separate runs
       matrix:
-        mode: ["query", "streaming"]
-        eval-data: ["tests/integration/test_evaluation_data.yaml"]
         lsc_image_path: ["quay.io/lightspeed-core/lightspeed-stack:latest"]

-    name: "E2E Lightspeed Evaluation Test, mode: ${{ matrix.mode }}"
+    name: "E2E Lightspeed Evaluation Test"

     env:
-      LSC_IMAGE_NAME: "lightspeed-stack-test-mode-${{ matrix.mode }}"
+      LSC_IMAGE_NAME: "lightspeed-stack-test"

     steps:
       # Stolen from lightspeed-stack
@@ -90,17 +85,6 @@ jobs:
           echo "Service did not start in time"
           exit 1

-      # Query mode
-      - name: Set query mode
-        if: matrix.mode == 'query'
-        run: |
-          echo "CONFIG=./tests/integration/system-config-query.yaml" >> $GITHUB_ENV
-
-      - name: Set streaming mode
-        if: matrix.mode == 'streaming'
-        run: |
-          echo "CONFIG=./tests/integration/system-config-streaming.yaml" >> $GITHUB_ENV
-
       # Dependencies
       - name: Install dependencies for Lightspeed Evaluation
         env:
@@ -118,31 +102,7 @@ jobs:
           FORCE_COLOR: 1
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: |
-          echo "============================="
-          echo "Running..."
-          echo "  config: ${CONFIG}"
-          echo "  LSC image: ${{ matrix.lsc_image_path }}"
-          echo "============================="
-          uv run lightspeed-eval --system-config "${CONFIG}" --eval-data "${{ matrix.eval-data }}"
-
-      # Check the result
-      - name: Check test result
-        run: |
-          OUT_FILES=( eval_output/evaluation_*_summary.json )
-          if [ ${#OUT_FILES[@]} != 1 ] ; then
-            echo "Multiple output files: " eval_output/evaluation_*_summary.json
-            exit 1
-          fi
-          OUT_FILE=${OUT_FILES[0]}
-          PASS=$( jq .summary_stats.overall.PASS $OUT_FILE )
-          EXPECTED="1"
-          if [ ${PASS} != ${EXPECTED} ] ; then
-            echo "============"
-            echo "Wrong PASS number in ${OUT_FILE}: got ${PASS}, expected ${EXPECTED}"
-            echo "============"
-            exit 1
-          fi
+          make e2e_tests

       # Cleanup
      - name: Stop the LSC if in local devel
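With the mode matrix and the per-mode CONFIG steps removed, the workflow's run step now delegates to the Makefile target added further down, so CI and local runs share a single entry point. A minimal sketch of what that step amounts to, assuming the checkout and uv setup from the earlier steps are already in place (the secret value below is a placeholder, not the real key):

    # Sketch of the simplified run step as plain shell, not the literal workflow YAML
    export FORCE_COLOR=1
    export OPENAI_API_KEY="<injected from secrets.OPENAI_API_KEY>"   # placeholder
    make e2e_tests   # -> uv run pytest tests/integration -v -m integration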

Makefile

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,9 @@ distribution-archives: ## Generate distribution archives to be uploaded into Pyt
 test: install-deps-test ## Execute tests with Pytest
 	uv run pytest tests lsc_agent_eval/tests

+e2e_tests: install-deps-test
+	uv run pytest tests/integration -v -m integration
+
 pre-commit: black-check docstyle pyright pylint ruff check-types bandit
 	@echo "All checks successful"
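For local use, the new target assumes the prerequisites documented in the new test module below: a Lightspeed-stack instance listening on localhost:8080 and OPENAI_API_KEY exported. A hedged example invocation (how the stack is started and the key value are placeholders):

    # Start your Lightspeed-stack instance on localhost:8080 first (not shown here)
    export OPENAI_API_KEY="sk-..."   # placeholder
    make e2e_tests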

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@ dev = [
     "pytest>=8.3.2",
     "pytest-cov>=5.0.0",
     "pytest-mock>=3.15.1",
+    "pytest-timeout>=2.4.0",
 ]

 [project.scripts]
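pytest-timeout joins the dev dependencies, presumably so that long-running end-to-end API calls cannot hang a CI job indefinitely. This commit does not show a configured timeout; a purely illustrative invocation would be:

    # Illustrative only: cap each test at 5 minutes via pytest-timeout
    uv run pytest tests/integration -v -m integration --timeout=300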

pytest.ini

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-[tool:pytest]
+[pytest]
 # Pytest configuration for LightSpeed Evaluation Framework

 # Test discovery
@@ -8,12 +8,12 @@ python_classes = Test*
 python_functions = test_*

 # Output options
-addopts =
+addopts =
     -v
     --tb=short
     --strict-markers
-    --disable-warnings
     --color=yes
+    -m "not integration"

 # Markers
 markers =
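Because -m "not integration" is injected through addopts, a plain pytest invocation deselects the new end-to-end tests by default, while the explicit -m integration passed by make e2e_tests appears after the addopts-injected options and should take precedence, selecting only those tests. Since --strict-markers is enabled, the integration marker also has to be registered under the markers section (its entries are not shown in this hunk). Roughly:

    uv run pytest                                       # default run: integration tests deselected
    uv run pytest tests/integration -v -m integration   # e2e tests only, same as `make e2e_tests`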

tests/integration/test_evaluation_data.yaml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
     turns:
       - turn_id: turn_id1
         query: What is the capital of France?
+        response: null # Force API call by explicitly setting to null
         expected_response: Paris
         expected_tool_calls: null
         turn_metrics:
Lines changed: 278 additions & 0 deletions
@@ -0,0 +1,278 @@
+# pylint: disable=redefined-outer-name,too-many-arguments,too-many-positional-arguments,import-outside-toplevel
+"""End-to-End Integration tests for LightSpeed Evaluation Framework.
+
+These tests run the complete evaluation pipeline with real services:
+- Real Lightspeed-stack API on localhost:8080
+- Real OpenAI API (requires OPENAI_API_KEY)
+- Real evaluation metrics (Ragas, DeepEval, etc.)
+
+Prerequisites:
+- Lightspeed-stack API running on localhost:8080
+- OPENAI_API_KEY environment variable set
+- Network connectivity for API calls
+
+Run with: pytest tests/integration/ -v -m integration
+"""
+
+import os
+from pathlib import Path
+
+import httpx
+import pytest
+
+from lightspeed_evaluation import ConfigLoader, evaluate
+from lightspeed_evaluation.core.models import EvaluationResult
+
+
+def check_api_available() -> bool:
+    """Check if Lightspeed-stack API is available on localhost:8080."""
+    try:
+        # Check root endpoint since /health may not exist
+        response = httpx.get("http://localhost:8080/v1/models", timeout=2.0)
+        return response.status_code == 200
+    except (httpx.ConnectError, httpx.TimeoutException):
+        return False
+
+
+def check_openai_key_available() -> bool:
+    """Check if OPENAI_API_KEY is set in environment."""
+    return bool(os.getenv("OPENAI_API_KEY"))
+
+
+# Mark ALL tests in this file as integration tests
+# These tests will NOT run by default - must explicitly run with: pytest -m integration
+pytestmark = pytest.mark.integration
+
+
+@pytest.fixture
+def integration_test_dir() -> Path:
+    """Get the integration test directory path."""
+    return Path(__file__).parent
+
+
+@pytest.fixture
+def eval_data_path(integration_test_dir: Path) -> Path:
+    """Get path to test evaluation data file."""
+    return integration_test_dir / "test_evaluation_data.yaml"
+
+
+@pytest.fixture
+def query_config_path(integration_test_dir: Path) -> Path:
+    """Get path to query endpoint system config file."""
+    return integration_test_dir / "system-config-query.yaml"
+
+
+@pytest.fixture
+def streaming_config_path(integration_test_dir: Path) -> Path:
+    """Get path to streaming endpoint system config file."""
+    return integration_test_dir / "system-config-streaming.yaml"
+
+
+class TestFullEvaluation:
+    """End-to-end tests for full evaluation with both query and streaming endpoints."""
+
+    @pytest.mark.parametrize(
+        "config_fixture,endpoint_type",
+        [
+            ("query_config_path", "query"),
+            ("streaming_config_path", "streaming"),
+        ],
+    )
+    def test_full_evaluation_endpoint(  # pylint: disable=too-many-locals
+        self,
+        config_fixture: str,
+        endpoint_type: str,
+        eval_data_path: Path,
+        request: pytest.FixtureRequest,
+        tmp_path: Path,
+    ) -> None:
+        """Test complete evaluation with both query and streaming endpoints.
+
+        This test verifies:
+        - System config loads correctly
+        - Evaluation data loads correctly
+        - API calls are made to localhost:8080
+        - LLM judge evaluates responses
+        - Pipeline executes without errors
+        - Results are PASS (evaluation succeeds)
+
+        Args:
+            config_fixture: Name of the fixture providing config path
+            endpoint_type: Type of endpoint ('query' or 'streaming')
+            eval_data_path: Path to evaluation data YAML
+            request: Pytest fixture request object
+            tmp_path: Temporary directory for output
+        """
+        # Get the actual config path from the fixture
+        config_path = request.getfixturevalue(config_fixture)
+
+        # Load configuration
+        loader = ConfigLoader()
+        system_config = loader.load_system_config(str(config_path))
+
+        # Override output directory to use temporary path
+        system_config.output.output_dir = str(tmp_path / "eval_output")
+
+        # Verify endpoint type matches expectation
+        assert (
+            system_config.api.endpoint_type == endpoint_type
+        ), f"Config should use {endpoint_type} endpoint"
+
+        # Load evaluation data
+        from lightspeed_evaluation.core.system import DataValidator
+
+        validator = DataValidator(
+            api_enabled=system_config.api.enabled,
+            fail_on_invalid_data=system_config.core.fail_on_invalid_data,
+        )
+        evaluation_data = validator.load_evaluation_data(str(eval_data_path))
+
+        # Verify evaluation data loaded
+        assert len(evaluation_data) > 0, "Evaluation data should not be empty"
+        assert evaluation_data[0].conversation_group_id == "conv_group_1"
+        assert len(evaluation_data[0].turns) > 0
+
+        # Run evaluation (makes real API calls)
+        results = evaluate(system_config, evaluation_data)
+
+        # Verify results
+        assert isinstance(results, list), "Results should be a list"
+        assert len(results) > 0, "Should have at least one result"
+
+        # Verify all results are EvaluationResult instances
+        for result in results:
+            assert isinstance(result, EvaluationResult)
+            assert result.conversation_group_id == "conv_group_1"
+            assert result.turn_id == "turn_id1"
+            assert result.metric_identifier is not None
+
+        # Verify we have the expected metric
+        metric_identifiers = [r.metric_identifier for r in results]
+        assert "ragas:response_relevancy" in metric_identifiers
+
+        # Find the response_relevancy result
+        relevancy_result = next(
+            r for r in results if r.metric_identifier == "ragas:response_relevancy"
+        )
+
+        # Verify the evaluation PASSED
+        assert relevancy_result.result == "PASS", (
+            f"Evaluation should PASS but got {relevancy_result.result}. "
+            f"Reason: {relevancy_result.reason}"
+        )
+
+        # Verify threshold is correct (from test data)
+        assert relevancy_result.threshold == 0.9, "Threshold should match test data"
+
+        # Verify score is above threshold
+        assert relevancy_result.response.strip(), "Should have response from API"
+        assert (
+            relevancy_result.score >= relevancy_result.threshold  # type: ignore
+        ), f"Score {relevancy_result.score} should be >= threshold {relevancy_result.threshold}"
+
+        # Verify we got a response from the API
+        assert relevancy_result.response is not None, "Should have response from API"
+        assert relevancy_result.query == "What is the capital of France?"
+
+        # Verify API token usage is tracked
+        assert (
+            relevancy_result.api_input_tokens > 0
+            or relevancy_result.api_output_tokens > 0
+        ), "API token usage should be tracked"
+
+    @pytest.mark.parametrize(
+        "config_fixture,endpoint_type",
+        [
+            ("query_config_path", "query"),
+            ("streaming_config_path", "streaming"),
+        ],
+    )
+    def test_api_response_enrichment(
+        self,
+        config_fixture: str,
+        endpoint_type: str,
+        eval_data_path: Path,
+        request: pytest.FixtureRequest,
+        tmp_path: Path,
+    ) -> None:
+        """Test that API responses properly enrich evaluation data.
+
+        Verifies:
+        - API returns responses for queries
+        - Token usage is tracked
+        - Contexts are retrieved (if available)
+        - Response is non-empty
+
+        Args:
+            config_fixture: Name of the fixture providing config path
+            endpoint_type: Type of endpoint ('query' or 'streaming')
+            eval_data_path: Path to evaluation data YAML
+            request: Pytest fixture request object
+            tmp_path: Temporary directory for output
+        """
+        # Get the actual config path from the fixture
+        config_path = request.getfixturevalue(config_fixture)
+
+        loader = ConfigLoader()
+        system_config = loader.load_system_config(str(config_path))
+        system_config.output.output_dir = str(tmp_path / "eval_output")
+
+        # Verify endpoint type matches expectation
+        assert (
+            system_config.api.endpoint_type == endpoint_type
+        ), f"Config should use {endpoint_type} endpoint"
+
+        from lightspeed_evaluation.core.system import DataValidator
+
+        validator = DataValidator(
+            api_enabled=system_config.api.enabled,
+            fail_on_invalid_data=system_config.core.fail_on_invalid_data,
+        )
+        evaluation_data = validator.load_evaluation_data(str(eval_data_path))
+
+        # Run evaluation
+        results = evaluate(system_config, evaluation_data)
+
+        # Verify at least one result has API data
+        has_api_data = any(
+            r.response.strip() and (r.api_input_tokens > 0 or r.api_output_tokens > 0)
+            for r in results
+        )
+        assert has_api_data, "At least one result should have API response data"
+
+        # Find a result with response data
+        result_with_response = next(
+            r
+            for r in results
+            if r.response.strip()
+            and (r.api_input_tokens > 0 or r.api_output_tokens > 0)
+        )
+
+        # Verify response is not empty
+        assert (
+            len(result_with_response.response) > 0
+        ), "API response should not be empty"
+
+        # Verify token usage
+        assert (
+            result_with_response.api_input_tokens > 0
+        ), "Input tokens should be tracked"
+        assert (
+            result_with_response.api_output_tokens > 0
+        ), "Output tokens should be tracked"
+
+
+class TestIntegrationConfiguration:
+    """Tests for integration test configuration and prerequisites."""
+
+    def test_api_connectivity(self) -> None:
+        """Verify that the Lightspeed-stack API is accessible."""
+        assert (
+            check_api_available()
+        ), "Lightspeed-stack API should be available on localhost:8080"
+
+    def test_openai_key_configured(self) -> None:
+        """Verify that OPENAI_API_KEY is configured."""
+        assert (
+            check_openai_key_available()
+        ), "OPENAI_API_KEY environment variable should be set"
