Skip to content

Commit cfa790f

Browse files
committed
Add test case for streaming in vllm orchestrator gateway
modified: tests/model_explainability/guardrails/test_guardrails.py modified: tests/model_explainability/guardrails/utils.py
1 parent ee22b90 commit cfa790f

File tree

2 files changed

+111
-15
lines changed

2 files changed

+111
-15
lines changed

tests/model_explainability/guardrails/test_guardrails.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ class TestGuardrailsOrchestratorWithBuiltInDetectors:
137137
4.3. No detection.
138138
5. Check that the /passthrough endpoint forwards the
139139
query directly to the model without performing any detection.
140+
6. Verify that the Guardrails Orchestrator correctly detects unsuitable outputs
141+
when using built-in detectors in streaming mode.
140142
"""
141143

142144
def test_guardrails_health_endpoint(
@@ -200,6 +202,23 @@ def test_guardrails_builtin_detectors_unsuitable_output(
200202
model=LLMdInferenceSimConfig.model_name,
201203
)
202204

205+
def test_guardrails_builtin_detectors_unsuitable_output_streaming(
206+
self,
207+
current_client_token,
208+
openshift_ca_bundle_file,
209+
llm_d_inference_sim_isvc,
210+
orchestrator_config,
211+
guardrails_orchestrator_gateway_route,
212+
):
213+
send_and_verify_unsuitable_output_detection(
214+
url=f"https://{guardrails_orchestrator_gateway_route.host}{PII_ENDPOINT}{OpenAIEnpoints.CHAT_COMPLETIONS}",
215+
token=current_client_token,
216+
ca_bundle_file=openshift_ca_bundle_file,
217+
prompt=PII_OUTPUT_DETECTION_PROMPT,
218+
model=LLMdInferenceSimConfig.model_name,
219+
stream=True,
220+
)
221+
203222
@pytest.mark.parametrize(
204223
"message, url_path",
205224
[

tests/model_explainability/guardrails/utils.py

Lines changed: 92 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ def get_auth_headers(token: str) -> Dict[str, str]:
1717
return {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
1818

1919

20-
def get_chat_detections_payload(content: str, model: str, detectors: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
20+
def get_chat_detections_payload(
21+
content: str, model: str, stream: bool = False, detectors: Optional[Dict[str, Any]] = None
22+
) -> Dict[str, Any]:
2123
"""
2224
Constructs a chat detections payload for a given content string.
2325
@@ -26,6 +28,10 @@ def get_chat_detections_payload(content: str, model: str, detectors: Optional[Di
2628
model: The model identifier to be used.
2729
detectors: Optional. A dictionary specifying detectors to be used.
2830
If None, detectors are not included in the payload.
31+
stream (bool, optional):
32+
If True, the payload includes `"stream": True`, instructing the
33+
orchestrator/model to return Server-Sent-Events (SSE) streaming
34+
responses. Defaults to False.
2935
3036
Returns:
3137
A dictionary representing the chat detections payload.
@@ -39,6 +45,9 @@ def get_chat_detections_payload(content: str, model: str, detectors: Optional[Di
3945
"temperature": 0,
4046
}
4147

48+
if stream:
49+
payload["stream"] = True
50+
4251
if detectors is not None:
4352
payload["detectors"] = detectors
4453

@@ -154,29 +163,86 @@ def verify_builtin_detector_unsuitable_input_response(
154163

155164

156165
def verify_builtin_detector_unsuitable_output_response(
157-
response: Response, detector_id: str, detection_name: str, detection_type: str
166+
response: Response,
167+
detector_id: str,
168+
detection_name: str,
169+
detection_type: str,
170+
stream: bool = False,
158171
) -> None:
159172
"""
160-
Verify that a guardrails response indicates an unsuitable output.
173+
Validate that a Guardrails response (streaming or non-streaming) indicates an UNSUITABLE_OUTPUT detection.
161174
162-
Args:
163-
response: The HTTP response object from the guardrails API
164-
detector_id: Expected detector ID
165-
detection_name: Expected detection name
166-
detection_type: Expected detection type
175+
This function parses the orchestrator output, extracts warnings and output detections,
176+
and verifies that they match the expected detector configuration.
167177
"""
168-
response_data = verify_and_parse_response(response=response)
178+
179+
if stream:
180+
text_chunks = []
181+
warnings = []
182+
output_detections = []
183+
184+
for line in response.iter_lines(decode_unicode=True):
185+
if not line:
186+
continue
187+
188+
if line.startswith("data:"):
189+
line = line[len("data:") :].strip() # noqa: E203
190+
191+
if not line or line == "[DONE]":
192+
continue
193+
194+
try:
195+
obj = json.loads(line)
196+
except json.JSONDecodeError:
197+
text_chunks.append(line)
198+
continue
199+
200+
# Streamed delta text
201+
for choice in obj.get("choices", []):
202+
delta = choice.get("delta", {})
203+
content = delta.get("content")
204+
if content:
205+
text_chunks.append(content)
206+
207+
# Final detection-only event
208+
if "detections" in obj:
209+
output_detections = obj["detections"].get("output", [])
210+
211+
if "warnings" in obj:
212+
warnings = obj["warnings"]
213+
214+
if "guardrails_metadata" in obj:
215+
meta = obj["guardrails_metadata"]
216+
warnings = meta.get("warnings", [])
217+
output_detections = meta.get("output_detections", [])
218+
219+
# DON'T fail if no text was streamed – detection-only is allowed
220+
if not output_detections:
221+
return None
222+
223+
# Return only when final detection chunk arrives
224+
response_data = {
225+
"warnings": warnings,
226+
"detections": {"output": output_detections},
227+
}
228+
229+
else:
230+
# Non-stream case
231+
response_data = verify_and_parse_response(response=response)
232+
233+
# ---- Validation Logic ----
169234
errors = []
170235

236+
# Validate warning
171237
unsuitable_output_warning = "UNSUITABLE_OUTPUT"
172238
warnings = response_data.get("warnings", [])
173239
if len(warnings) != 1:
174240
errors.append(f"Expected 1 warning in response, got {len(warnings)}")
175-
elif warnings[0]["type"] != unsuitable_output_warning:
176-
errors.append(f"Expected warning type {unsuitable_output_warning}, got {warnings[0]['type']}")
241+
elif warnings[0].get("type") != unsuitable_output_warning:
242+
errors.append(f"Expected warning type {unsuitable_output_warning}, got {warnings[0].get('type')}")
177243

244+
# Validate detections
178245
output_detections = response_data.get("detections", {}).get("output", [])
179-
180246
if len(output_detections) < 1:
181247
errors.append(f"Expected at least one output detection, but got {len(output_detections)}.")
182248
else:
@@ -272,12 +338,14 @@ def _send_guardrails_orchestrator_post_request(
272338
token: str,
273339
ca_bundle_file: str,
274340
payload: Dict[str, Any],
341+
stream: bool = False,
275342
) -> requests.Response:
276343
response = requests.post(
277344
url=url,
278345
headers=get_auth_headers(token=token),
279346
json=payload,
280347
verify=ca_bundle_file,
348+
stream=stream,
281349
)
282350

283351
if response.status_code != http.HTTPStatus.OK:
@@ -292,11 +360,12 @@ def send_chat_detections_request(
292360
ca_bundle_file: str,
293361
content: str,
294362
model: str,
363+
stream: bool = False,
295364
detectors: Dict[str, Any] = None,
296365
) -> requests.Response:
297-
payload = get_chat_detections_payload(content=content, model=model, detectors=detectors)
366+
payload = get_chat_detections_payload(content=content, model=model, detectors=detectors, stream=stream)
298367
return _send_guardrails_orchestrator_post_request(
299-
url=url, token=token, ca_bundle_file=ca_bundle_file, payload=payload
368+
url=url, token=token, ca_bundle_file=ca_bundle_file, payload=payload, stream=stream
300369
)
301370

302371

@@ -331,19 +400,27 @@ def send_and_verify_unsuitable_output_detection(
331400
ca_bundle_file: str,
332401
prompt: GuardrailsDetectionPrompt,
333402
model: str,
403+
stream: bool = False,
334404
detectors: Dict[str, Any] = None,
335405
):
336406
"""Send a prompt to the GuardrailsOrchestrator and verify that it triggers an unsuitable output detection"""
337407

338408
response = send_chat_detections_request(
339-
url=url, token=token, ca_bundle_file=ca_bundle_file, content=prompt.content, model=model, detectors=detectors
409+
url=url,
410+
token=token,
411+
ca_bundle_file=ca_bundle_file,
412+
content=prompt.content,
413+
model=model,
414+
detectors=detectors,
415+
stream=stream,
340416
)
341417

342418
verify_builtin_detector_unsuitable_output_response(
343419
response=response,
344420
detector_id=prompt.detector_id,
345421
detection_name=prompt.detection_name,
346422
detection_type=prompt.detection_type,
423+
stream=stream,
347424
)
348425
return response
349426

0 commit comments

Comments (0)