144 changes: 132 additions & 12 deletions src/lightspeed_evaluation/core/api/client.py
@@ -27,12 +27,19 @@
logger = logging.getLogger(__name__)


def _is_too_many_requests_error(exception: BaseException) -> bool:
"""Check if exception is a 429 error."""
return (
isinstance(exception, httpx.HTTPStatusError)
and exception.response.status_code == 429
)
def _is_retryable_server_error(exception: BaseException) -> bool:
"""Check if exception is a retryable HTTP error (429 or 5xx).

Args:
exception: The exception to check.

Returns:
True if the exception is a retryable HTTP status error.
"""
if not isinstance(exception, httpx.HTTPStatusError):
return False
status = exception.response.status_code
return status == 429 or 500 <= status < 600
Collaborator:

Instead of a range, could you please write out the exact status codes, like 502, 503, etc.?

I understand the purpose, but at the same time I am concerned that we will end up wasting retries.
In the future we will probably make it configurable.

Author:

Done — narrowed to status in (429, 502, 503, 504). Only clearly transient errors: 429=rate limit, 502=bad gateway, 503=unavailable, 504=gateway timeout. 500 excluded since it can indicate permanent server bugs.
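For reference, a minimal sketch of the narrowed predicate described in that reply; the frozenset constant is illustrative, and the commit may inline the tuple instead:

```python
import httpx

# Only clearly transient statuses are retried, per the review discussion.
RETRYABLE_STATUS_CODES = frozenset({429, 502, 503, 504})


def _is_retryable_server_error(exception: BaseException) -> bool:
    """Check if exception is a transient HTTP error worth retrying."""
    if not isinstance(exception, httpx.HTTPStatusError):
        return False
    return exception.response.status_code in RETRYABLE_STATUS_CODES
```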



class APIClient:
@@ -59,10 +66,11 @@ def __init__(
retry_decorator = self._create_retry_decorator()
self._standard_query_with_retry = retry_decorator(self._standard_query)
self._streaming_query_with_retry = retry_decorator(self._streaming_query)
self._rlsapi_infer_query_with_retry = retry_decorator(self._rlsapi_infer_query)

def _create_retry_decorator(self) -> Any:
return retry(
retry=retry_if_exception(_is_too_many_requests_error),
retry=retry_if_exception(_is_retryable_server_error),
stop=stop_after_attempt(
self.config.num_retries + 1
), # +1 to account for the initial attempt
@@ -186,6 +194,8 @@ def query(

if self.config.endpoint_type == "streaming":
response = self._streaming_query_with_retry(api_request)
elif self.config.endpoint_type == "infer":
response = self._rlsapi_infer_query_with_retry(api_request)
else:
response = self._standard_query_with_retry(api_request)

@@ -196,7 +206,7 @@
except RetryError as e:
raise APIError(
f"Maximum retry attempts ({self.config.num_retries}) reached "
"due to persistent rate limiting (HTTP 429)."
"due to retryable server errors (HTTP 429/5xx)."
) from e

def _prepare_request(
@@ -285,8 +295,7 @@ def _standard_query(self, api_request: APIRequest) -> APIResponse:
except httpx.TimeoutException as e:
raise self._handle_timeout_error("standard", self.config.timeout) from e
except httpx.HTTPStatusError as e:
# Re-raise 429 errors without conversion to allow retry decorator to handle them
if e.response.status_code == 429:
if _is_retryable_server_error(e):
raise
raise self._handle_http_error(e) from e
except ValueError as e:
@@ -313,8 +322,7 @@ def _streaming_query(self, api_request: APIRequest) -> APIResponse:
except httpx.TimeoutException as e:
raise self._handle_timeout_error("streaming", self.config.timeout) from e
except httpx.HTTPStatusError as e:
# Re-raise 429 errors without conversion to allow retry decorator to handle them
if e.response.status_code == 429:
if _is_retryable_server_error(e):
raise
raise self._handle_http_error(e) from e
except ValueError as e:
@@ -324,6 +332,118 @@ def _streaming_query(self, api_request: APIRequest) -> APIResponse:
except Exception as e:
raise self._handle_unexpected_error(e, "streaming query") from e

def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
"""Query the RLSAPI /infer endpoint for tool call and RAG metadata.

The infer endpoint uses a different request/response format than
the standard query/streaming endpoints, converting "query" to
"question" and parsing tool_calls and rag_chunks from tool_results.

Args:
api_request: The prepared API request.

Returns:
APIResponse with response text, tool calls, and RAG contexts.

Raises:
APIError: If the request fails or response is invalid.
"""
if not self.client:
raise APIError("HTTP client not initialized")
try:
request_data = api_request.model_dump(exclude_none=True)
infer_request: dict[str, object] = {
"question": request_data.pop("query"),
"include_metadata": True,
}
Contributor (@coderabbitai bot, Apr 23, 2026):

⚠️ Potential issue | 🟠 Major

Don’t drop prepared request parameters for /infer.

Line 356 sends only question and include_metadata, so extra_request_params and other prepared request fields are ignored for endpoint_type == "infer". If /infer supports only a subset, filter explicitly instead of silently discarding everything.

Proposed fix
-            request_data = api_request.model_dump(exclude_none=True)
+            request_data = self._serialize_request(api_request)
+            question = request_data.pop("query")
             infer_request: dict[str, object] = {
-                "question": request_data.pop("query"),
+                **request_data,
+                "question": question,
                 "include_metadata": True,
             }

Author:

Fixed — added a code comment explaining that extra_request_params are not forwarded to /infer since the endpoint only accepts question and include_metadata. Other params (model, provider, etc.) are not part of the RLSAPI /infer API contract.
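For reference, a sketch of how the resolved construction might look. The helper name `_build_infer_request` comes from a later reply in this thread; its signature here is an assumption:

```python
from typing import Any


def _build_infer_request(request_data: dict[str, Any]) -> dict[str, object]:
    """Build the RLSAPI /infer payload from a prepared request dict (sketch)."""
    # extra_request_params and other prepared fields are intentionally not
    # forwarded: the RLSAPI /infer contract accepts only "question" and
    # "include_metadata".
    return {
        "question": request_data.pop("query"),
        "include_metadata": True,
    }
```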


logger.debug(
"RLSAPI infer request URL: /api/lightspeed/%s/infer",
self.config.version,
)
logger.debug("RLSAPI infer request body: %s", infer_request)

response = self.client.post(
f"/api/lightspeed/{self.config.version}/infer",
json=infer_request,
)
response.raise_for_status()

response_data = response.json()

if "data" in response_data:
data = response_data["data"]
if "text" in data:
response_data["response"] = data["text"]
if "request_id" in data:
response_data["conversation_id"] = data["request_id"]
if "input_tokens" in data:
response_data["input_tokens"] = data["input_tokens"]
if "output_tokens" in data:
response_data["output_tokens"] = data["output_tokens"]
if "tool_calls" in data:
response_data["tool_calls"] = data["tool_calls"]
if "tool_results" in data:
tool_results = data["tool_results"]
for result in tool_results:
if result.get("type") == "mcp_call":
content = result["content"].split("---")
response_data["rag_chunks"] = [
{"content": chunk} for chunk in content
]

if "response" not in response_data:
raise APIError("API response missing 'response' field")

if "tool_calls" in response_data and response_data["tool_calls"]:
raw_tool_calls = response_data["tool_calls"]
formatted_tool_calls = []

for tool_call in raw_tool_calls:
if isinstance(tool_call, dict):
formatted_tool: dict[str, object] = {
"tool_name": (
tool_call.get("tool_name")
or tool_call.get("name")
or ""
Collaborator:

tool_name & name were used for compatibility with older LCORE versions. But is it the same scenario for RLSAPI? Can we use the actual property name?

Author:

Good point — RLSAPI uses its own native fields (name/args), not the legacy OLS tool_name/arguments format. Removed the fallback chain in _rlsapi_infer_query and now use tool_call.get("name", "") and tool_call.get("args", {}) directly. The legacy fallback in _standard_query is untouched since OLS may still use either format.
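A sketch of the RLSAPI-native mapping after that change; the standalone helper is illustrative, since the actual code does this inline:

```python
def _format_tool_call(tool_call: dict[str, object]) -> dict[str, object]:
    """Map one RLSAPI-native tool call to the evaluation format (sketch)."""
    # RLSAPI emits native "name"/"args" fields, so no legacy
    # tool_name/arguments fallback is needed here.
    return {
        "tool_name": tool_call.get("name", ""),
        "arguments": tool_call.get("args", {}),
    }
```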

Collaborator:

Looking at the whole flow, I think that this (overall API interaction/data processing) can be modularized further. Non-blocker for this PR.

Author:

Agreed. The latest refactoring commit (3e0af30) broke _rlsapi_infer_query into _build_infer_request, _extract_infer_data, and _format_infer_tool_calls — a step in that direction. Happy to continue modularizing in a follow-up.
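For context, a rough sketch of that decomposition; the exact division of labor and the helper signatures are assumptions based on this reply, not the actual commit:

```python
def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
    """Query RLSAPI /infer (sketch of the refactored flow)."""
    infer_request = self._build_infer_request(api_request)
    response = self.client.post(
        f"/api/lightspeed/{self.config.version}/infer",
        json=infer_request,
    )
    response.raise_for_status()
    # Map data.* fields into the flat response dict, then normalize
    # tool calls into the evaluation format.
    response_data = self._extract_infer_data(response.json())
    self._format_infer_tool_calls(response_data)
    return APIResponse.from_raw_response(response_data)
```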

),
"arguments": (
tool_call.get("arguments")
or tool_call.get("args")
or {}
),
}
if "tool_results" in response_data.get("data", {}):
tool_call_id = tool_call.get("id")
matching_result = next(
(
r
for r in response_data["data"]["tool_results"]
if r.get("id") == tool_call_id
),
None,
)
if matching_result:
formatted_tool["result"] = matching_result["status"]
formatted_tool_calls.append([formatted_tool])

response_data["tool_calls"] = formatted_tool_calls

return APIResponse.from_raw_response(response_data)

except httpx.TimeoutException as e:
raise self._handle_timeout_error("infer", self.config.timeout) from e
except httpx.HTTPStatusError as e:
if _is_retryable_server_error(e):
raise
raise self._handle_http_error(e) from e
except ValueError as e:
raise self._handle_validation_error(e) from e
except APIError:
raise
except Exception as e:
raise self._handle_unexpected_error(e, "infer query") from e

def _handle_response_errors(self, response: httpx.Response) -> None:
"""Handle HTTP response errors for streaming endpoint."""
if response.status_code != 200:
4 changes: 2 additions & 2 deletions src/lightspeed_evaluation/core/constants.py
@@ -56,7 +56,7 @@
DEFAULT_API_VERSION = "v1"
DEFAULT_API_TIMEOUT = 300
DEFAULT_ENDPOINT_TYPE = "streaming"
SUPPORTED_ENDPOINT_TYPES = ["streaming", "query"]
SUPPORTED_ENDPOINT_TYPES = ["streaming", "query", "infer"]
DEFAULT_API_CACHE_DIR = ".caches/api_cache"

DEFAULT_API_NUM_RETRIES = 3
Comment on lines +59 to 62
Contributor (@coderabbitai bot, Apr 23, 2026):

⚠️ Potential issue | 🟡 Minor

Inconsistent retry default: API retries still 3 while LLM retries bumped to 5.

The PR description states default retry attempts were increased from 3 to 5, but DEFAULT_API_NUM_RETRIES (line 62) is still 3 while only DEFAULT_LLM_RETRIES (line 73) was updated. Note that the API client's retry decorator uses APIConfig.num_retries (which defaults to DEFAULT_API_NUM_RETRIES), so the advertised retry bump does not actually reach the new 429/5xx retry path for API calls. Confirm whether the API default should also be raised to 5 for consistency.


Author:

Addressed — reverted DEFAULT_LLM_RETRIES back to 3. As asamal4 noted, this is the judge LLM default, not API retry. Actual values are configurable per-config.


@@ -70,7 +70,7 @@
DEFAULT_SSL_CERT_FILE = None
DEFAULT_LLM_TEMPERATURE = 0.0
DEFAULT_LLM_MAX_TOKENS = 512
DEFAULT_LLM_RETRIES = 3
DEFAULT_LLM_RETRIES = 5
Collaborator:

This is for the judge LLM, not for API calls. Also, this is just the default value; the actual value can be set in config as needed. Do we really need to change this?

Author:

Agreed, reverted to 3. Will configure via eval config when needed.

DEFAULT_LLM_CACHE_DIR = ".caches/llm_cache"

DEFAULT_EMBEDDING_PROVIDER = "openai"
8 changes: 8 additions & 0 deletions src/lightspeed_evaluation/core/models/data.py
@@ -370,6 +370,14 @@ class EvaluationData(BaseModel):
min_length=1,
description="Tag for grouping and filtering conversations",
)
skip: bool = Field(
default=False,
description="Skip this conversation during evaluation",
)
skip_reason: Optional[str] = Field(
default=None,
description="Why this conversation is skipped (documentation only)",
)

# Conversation-level metrics
conversation_metrics: Optional[list[str]] = Field(
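For illustration, a hypothetical sketch of the new fields in use; the `conversation_group_id` field name and the values here are assumptions, not taken from the diff:

```python
from lightspeed_evaluation.core.models.data import EvaluationData

# Hypothetical entry marked as skipped (other required fields omitted).
conv = EvaluationData(
    conversation_group_id="kb-regression-01",  # assumed field name
    skip=True,
    skip_reason="Upstream MCP server unavailable",
)

# The validator drops skipped conversations before evaluation:
active = [e for e in [conv] if not e.skip]
assert active == []
```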
2 changes: 1 addition & 1 deletion src/lightspeed_evaluation/core/models/system.py
@@ -301,7 +301,7 @@ class APIConfig(BaseModel):
ge=0,
description=(
"Maximum number of retry attempts for API calls on "
"429 Too Many Requests errors"
"retryable server errors (HTTP 429/5xx)"
),
)

15 changes: 15 additions & 0 deletions src/lightspeed_evaluation/core/system/validator.py
@@ -171,6 +171,7 @@ def load_evaluation_data(
data_path: str,
tags: Optional[list[str]] = None,
conv_ids: Optional[list[str]] = None,
metrics: Optional[list[str]] = None,
) -> list[EvaluationData]:
"""Load, filter, and validate evaluation data from YAML file.

@@ -184,6 +185,7 @@
data_path: Path to the evaluation data YAML file
tags: Optional list of tags to filter by
conv_ids: Optional list of conversation group IDs to filter by
metrics: Optional list of metrics to run (filters each turn's turn_metrics)

Returns:
Filtered and validated list of Evaluation Data
@@ -230,6 +232,19 @@
# Filter by scope before validation
evaluation_data = self._filter_by_scope(evaluation_data, tags, conv_ids)

# Remove skipped conversations
evaluation_data = [e for e in evaluation_data if not e.skip]

# Filter turn_metrics if --metrics was specified
Collaborator:


Based on this logic, I see two limitations:

  1. --metrics only allows turn-level metric selection, not conversation-level.
  2. This works only if there is a metric override in eval_data.yaml. But we can also set default metrics in system.yaml/config and not override from eval_data.

Author:

Both issues addressed:

  1. turn_metrics=None handling: Fixed the guard from if turn.turn_metrics: to if turn.turn_metrics is not None:. When turn_metrics is None (use system defaults) and --metrics is specified, we now materialize the system defaults (entries with default: true) and then filter — so downstream MetricManager sees a non-None filtered list.

  2. conversation_metrics filtering: Added. Same three-way handling: None → materialize defaults then filter, [] → leave as-is (skip all), [...] → filter the explicit list.

Added 3 tests covering all cases.
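A minimal sketch of that three-way handling; the helper name and the system_defaults argument are illustrative, not the actual code:

```python
from typing import Optional


def _filter_metric_list(
    current: Optional[list[str]],
    system_defaults: list[str],
    requested: set[str],
) -> Optional[list[str]]:
    """Apply a --metrics filter to one turn's or conversation's metric list."""
    if current is None:
        # None means "use system defaults": materialize the defaults and
        # filter them, so downstream MetricManager sees a concrete list.
        return [m for m in system_defaults if m in requested]
    if not current:
        # [] means "skip all metrics"; leave it untouched.
        return current
    return [m for m in current if m in requested]
```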

Collaborator:

Sorry, I didn't mention that metric resolution is already handled.

Here we are adding (1) duplicate logic to process turn- and conversation-level metrics, and (2) metric resolution from config/system. But this is already managed in MetricManager, where we resolve the final turn- and conversation-level metrics. Moving the CLI filter there would eliminate the duplicate code significantly and be more consistent.

This can be refactored later (follow-up PR). I am considering this a non-blocker.

Author:

Agreed — I can see how MetricManager is the right home for this. Happy to have it backlogged and refactored in a follow-up. Thanks for flagging it.

if metrics:
metrics_set = set(metrics)
for eval_data in evaluation_data:
for turn in eval_data.turns:
if turn.turn_metrics:
turn.turn_metrics = [
m for m in turn.turn_metrics if m in metrics_set
]

# Semantic validation (metrics availability and requirements)
if not self._validate_evaluation_data(evaluation_data):
raise DataValidationError("Evaluation data validation failed")
7 changes: 7 additions & 0 deletions src/lightspeed_evaluation/runner/evaluation.py
@@ -132,6 +132,7 @@ def run_evaluation( # pylint: disable=too-many-locals
eval_args.eval_data,
tags=eval_args.tags,
conv_ids=eval_args.conv_ids,
metrics=eval_args.metrics,
)

print(
@@ -236,6 +237,12 @@ def main() -> int:
default=None,
help="Filter by conversation group IDs (run only specified conversations)",
)
parser.add_argument(
"--metrics",
nargs="+",
default=None,
help="Filter to only run specified metrics (e.g. custom:answer_correctness)",
)
parser.add_argument(
"--cache-warmup",
action="store_true",
13 changes: 13 additions & 0 deletions tests/unit/core/api/conftest.py
@@ -35,6 +35,19 @@ def basic_api_config_streaming_endpoint() -> APIConfig:
)


@pytest.fixture
def basic_api_config_infer_endpoint() -> APIConfig:
"""Create test API config for infer endpoint."""
return APIConfig(
enabled=True,
api_base="http://localhost:8080",
version="v1",
endpoint_type="infer",
timeout=30,
cache_enabled=False,
)
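
A hypothetical test sketch using this fixture. The APIClient constructor, the client.client attribute, and the /infer response shape are all assumptions inferred from the diff above:

```python
from lightspeed_evaluation.core.api.client import APIClient


def test_infer_query_parses_response(basic_api_config_infer_endpoint, mocker):
    """Sketch: /infer text and tool calls should surface in APIResponse."""
    client = APIClient(basic_api_config_infer_endpoint)  # assumed signature
    client.client = mocker.Mock()  # replace the HTTP client wholesale
    client.client.post.return_value.status_code = 200
    client.client.post.return_value.json.return_value = {
        "data": {
            "text": "etcd stores cluster state.",
            "request_id": "req-123",
            "tool_calls": [{"id": "t1", "name": "search", "args": {"q": "etcd"}}],
            "tool_results": [
                {
                    "id": "t1",
                    "type": "mcp_call",
                    "status": "success",
                    "content": "chunk one---chunk two",
                }
            ],
        }
    }
    api_request = mocker.Mock(
        model_dump=lambda exclude_none: {"query": "what is etcd?"}
    )
    response = client._rlsapi_infer_query(api_request)
    assert response.response == "etcd stores cluster state."  # assumed attribute
```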


@pytest.fixture
def mock_response(mocker: MockerFixture) -> Any:
"""Create a mock streaming response."""
Expand Down