fix: address PR review feedback (asamal4 + CodeRabbit)

Lifto · Lifto · commit be8019de598a · 2026-04-27T14:44:04.000-04:00
- Revert DEFAULT_LLM_RETRIES from 5 to 3
- Narrow retry codes to (429, 502, 503, 504), exclude 500
- Use RLSAPI native fields (name/args) in _rlsapi_infer_query
- Fix RAG chunk accumulation across multiple mcp_call results
- Redact prompt from debug log, log only metadata
- Add comment about extra_request_params not forwarded to /infer
- Fix tool result capture: use content with status fallback
- Update endpoint_type description to include infer
- Move skip tests from TestFilterByScope to TestDataValidator
- Fix MockerFixture import in test_validator.py
- Fix --metrics filter: handle turn_metrics=None by materializing
  system defaults before filtering; add conversation_metrics filter
- Add metrics=None to runner test fixture for --metrics support
- Add tests for metrics filter materialization

Signed-off-by: Ellis Low <elow@redhat.com>
diff --git a/src/lightspeed_evaluation/core/api/client.py b/src/lightspeed_evaluation/core/api/client.py
@@ -28,7 +28,11 @@
 
 
 def _is_retryable_server_error(exception: BaseException) -> bool:
-    """Check if exception is a retryable HTTP error (429 or 5xx).
+    """Check if exception is a retryable HTTP error (429 or transient 5xx).
+
+    Among 5xx codes, only 502 Bad Gateway, 503 Service Unavailable, and
+    504 Gateway Timeout are retried. 500 Internal Server Error is excluded
+    because it may indicate a permanent server bug.
 
     Args:
         exception: The exception to check.
@@ -39,7 +43,7 @@ def _is_retryable_server_error(exception: BaseException) -> bool:
     if not isinstance(exception, httpx.HTTPStatusError):
         return False
     status = exception.response.status_code
-    return status == 429 or 500 <= status < 600
+    return status in (429, 502, 503, 504)
 
 
 class APIClient:
@@ -352,6 +356,10 @@ def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
             raise APIError("HTTP client not initialized")
         try:
             request_data = api_request.model_dump(exclude_none=True)
+            # `extra_request_params` are not forwarded to `/infer` — the
+            # endpoint only accepts `question` and `include_metadata`.
+            # Other params (model, provider, etc.) are not part of the
+            # RLSAPI `/infer` API contract.
             infer_request: dict[str, object] = {
                 "question": request_data.pop("query"),
                 "include_metadata": True,
@@ -361,7 +369,13 @@ def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
                 "RLSAPI infer request URL: /api/lightspeed/%s/infer",
                 self.config.version,
             )
-            logger.debug("RLSAPI infer request body: %s", infer_request)
+            logger.debug(
+                "RLSAPI infer request: version=%s, include_metadata=%s, "
+                "question_length=%d",
+                self.config.version,
+                True,
+                len(str(infer_request.get("question", ""))),
+            )
 
             response = self.client.post(
                 f"/api/lightspeed/{self.config.version}/infer",
@@ -385,12 +399,12 @@ def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
                     response_data["tool_calls"] = data["tool_calls"]
                 if "tool_results" in data:
                     tool_results = data["tool_results"]
+                    rag_chunks: list[dict[str, str]] = []
                     for result in tool_results:
                         if result.get("type") == "mcp_call":
                             content = result["content"].split("---")
-                            response_data["rag_chunks"] = [
-                                {"content": chunk} for chunk in content
-                            ]
+                            rag_chunks.extend([{"content": chunk} for chunk in content])
+                    response_data["rag_chunks"] = rag_chunks
 
             if "response" not in response_data:
                 raise APIError("API response missing 'response' field")
@@ -402,16 +416,8 @@ def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
                 for tool_call in raw_tool_calls:
                     if isinstance(tool_call, dict):
                         formatted_tool: dict[str, object] = {
-                            "tool_name": (
-                                tool_call.get("tool_name")
-                                or tool_call.get("name")
-                                or ""
-                            ),
-                            "arguments": (
-                                tool_call.get("arguments")
-                                or tool_call.get("args")
-                                or {}
-                            ),
+                            "tool_name": tool_call.get("name", ""),
+                            "arguments": tool_call.get("args", {}),
                         }
                         if "tool_results" in response_data.get("data", {}):
                             tool_call_id = tool_call.get("id")
@@ -424,7 +430,12 @@ def _rlsapi_infer_query(self, api_request: APIRequest) -> APIResponse:
                                 None,
                             )
                             if matching_result:
-                                formatted_tool["result"] = matching_result["status"]
+                                formatted_tool["result"] = matching_result.get(
+                                    "content", matching_result.get("status", "")
+                                )
+                                formatted_tool["status"] = matching_result.get(
+                                    "status", ""
+                                )
                         formatted_tool_calls.append([formatted_tool])
 
                 response_data["tool_calls"] = formatted_tool_calls
diff --git a/src/lightspeed_evaluation/core/constants.py b/src/lightspeed_evaluation/core/constants.py
@@ -70,7 +70,7 @@
 DEFAULT_SSL_CERT_FILE = None
 DEFAULT_LLM_TEMPERATURE = 0.0
 DEFAULT_LLM_MAX_TOKENS = 512
-DEFAULT_LLM_RETRIES = 5
+DEFAULT_LLM_RETRIES = 3
 DEFAULT_LLM_CACHE_DIR = ".caches/llm_cache"
 
 DEFAULT_EMBEDDING_PROVIDER = "openai"
diff --git a/src/lightspeed_evaluation/core/models/system.py b/src/lightspeed_evaluation/core/models/system.py
@@ -271,7 +271,7 @@ class APIConfig(BaseModel):
     )
     endpoint_type: str = Field(
         default=DEFAULT_ENDPOINT_TYPE,
-        description="API endpoint type (streaming or query)",
+        description="API endpoint type (streaming, query, or infer)",
     )
     timeout: int = Field(
         default=DEFAULT_API_TIMEOUT, ge=1, description="Request timeout in seconds"
diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py
@@ -159,6 +159,7 @@ def __init__(
         self.api_enabled = api_enabled
         self.original_data_path: Optional[str] = None
         self.fail_on_invalid_data = fail_on_invalid_data
+        self._system_config = system_config
         self._turn_level_metrics: set[str] = (
             system_config.turn_level_metric_names if system_config else set()
         )
@@ -235,15 +236,38 @@ def load_evaluation_data(
         # Remove skipped conversations
         evaluation_data = [e for e in evaluation_data if not e.skip]
 
-        # Filter turn_metrics if --metrics was specified
+        # Filter turn_metrics and conversation_metrics if --metrics was specified
         if metrics:
             metrics_set = set(metrics)
             for eval_data in evaluation_data:
                 for turn in eval_data.turns:
-                    if turn.turn_metrics:
+                    if turn.turn_metrics is not None:
                         turn.turn_metrics = [
                             m for m in turn.turn_metrics if m in metrics_set
                         ]
+                    elif self._system_config is not None:
+                        turn_defaults = (
+                            self._system_config.default_turn_metrics_metadata
+                        )
+                        turn.turn_metrics = [
+                            m
+                            for m, meta in turn_defaults.items()
+                            if meta.get("default", False) and m in metrics_set
+                        ]
+
+                if eval_data.conversation_metrics is not None:
+                    eval_data.conversation_metrics = [
+                        m for m in eval_data.conversation_metrics if m in metrics_set
+                    ]
+                elif self._system_config is not None:
+                    conv_defaults = (
+                        self._system_config.default_conversation_metrics_metadata
+                    )
+                    eval_data.conversation_metrics = [
+                        m
+                        for m, meta in conv_defaults.items()
+                        if meta.get("default", False) and m in metrics_set
+                    ]
 
         # Semantic validation (metrics availability and requirements)
         if not self._validate_evaluation_data(evaluation_data):
diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py
@@ -697,7 +697,7 @@ def test_is_retryable_server_error(self, mocker: MockerFixture) -> None:
         )
 
         resp_500 = mocker.Mock(status_code=500)
-        assert _is_retryable_server_error(
+        assert not _is_retryable_server_error(
             httpx.HTTPStatusError("", request=mocker.Mock(), response=resp_500)
         )
 
@@ -814,23 +814,23 @@ def test_query_raises_api_error_after_max_retries(
 
         assert mock_client.post.call_count == 4  # 3 retries + 1 initial attempt
 
-    def test_standard_query_retries_on_500_then_succeeds(
+    def test_standard_query_retries_on_502_then_succeeds(
         self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture
     ) -> None:
-        """Test standard query retries on 500 error and succeeds on retry."""
-        mock_response_500 = mocker.Mock(status_code=500, text="Internal server error")
-        mock_response_500.raise_for_status.side_effect = httpx.HTTPStatusError(
-            "500 error", request=mocker.Mock(), response=mock_response_500
+        """Test standard query retries on 502 error and succeeds on retry."""
+        mock_response_502 = mocker.Mock(status_code=502, text="Bad gateway")
+        mock_response_502.raise_for_status.side_effect = httpx.HTTPStatusError(
+            "502 error", request=mocker.Mock(), response=mock_response_502
         )
 
         mock_response_success = mocker.Mock(status_code=200)
         mock_response_success.json.return_value = {
-            "response": "Success after 500 retry",
+            "response": "Success after 502 retry",
             "conversation_id": "conv_123",
         }
 
         mock_client = mocker.Mock()
-        mock_client.post.side_effect = [mock_response_500, mock_response_success]
+        mock_client.post.side_effect = [mock_response_502, mock_response_success]
         mock_client.headers = {}
 
         mocker.patch(
@@ -841,7 +841,7 @@ def test_standard_query_retries_on_500_then_succeeds(
         client = APIClient(basic_api_config_query_endpoint)
         result = client.query("Test standard query")
 
-        assert result.response == "Success after 500 retry"
+        assert result.response == "Success after 502 retry"
         assert mock_client.post.call_count == 2
 
 
@@ -903,7 +903,7 @@ def test_infer_query_formats_tool_calls(
                         "name": "search_documentation",
                         "args": {"q": "rhel"},
                     },
-                    {"id": "tc2", "tool_name": "mcp_list_tools", "arguments": {}},
+                    {"id": "tc2", "name": "mcp_list_tools", "args": {}},
                 ],
                 "tool_results": [
                     {
@@ -938,9 +938,11 @@ def test_infer_query_formats_tool_calls(
         assert isinstance(result.tool_calls[0], list)
         assert result.tool_calls[0][0]["tool_name"] == "search_documentation"
         assert result.tool_calls[0][0]["arguments"] == {"q": "rhel"}
-        assert result.tool_calls[0][0]["result"] == "success"
+        assert result.tool_calls[0][0]["result"] == "result1"
+        assert result.tool_calls[0][0]["status"] == "success"
         assert result.tool_calls[1][0]["tool_name"] == "mcp_list_tools"
-        assert result.tool_calls[1][0]["result"] == "completed"
+        assert result.tool_calls[1][0]["result"] == "tools"
+        assert result.tool_calls[1][0]["status"] == "completed"
 
     def test_infer_query_extracts_rag_chunks(
         self, basic_api_config_infer_endpoint: APIConfig, mocker: MockerFixture
diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py
diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py

Original file line number	Diff line number	Diff line change
`@@ -271,7 +271,7 @@ class APIConfig(BaseModel):`
`271`	`271`	`)`
`272`	`272`	`endpoint_type: str = Field(`
`273`	`273`	`default=DEFAULT_ENDPOINT_TYPE,`
`274`		`- description="API endpoint type (streaming or query)",`
	`274`	`+ description="API endpoint type (streaming, query, or infer)",`
`275`	`275`	`)`
`276`	`276`	`timeout: int = Field(`
`277`	`277`	`default=DEFAULT_API_TIMEOUT, ge=1, description="Request timeout in seconds"`