Add GPU tests with vLLM runtime and Qwen model deployment for Guardrails (#1259)

Sandeep20013 · dbasunag · pre-commit-ci[bot] · jgarciao · commit ca8b9ca9c278 · 2026-04-02T18:17:46.000+02:00
* chore: branching 3.4ea1 and generating new tag (#1171)
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* test(guardrails): add GPU Integration test using vLLM runtime
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* update: include remaining changes
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* chore: remove oc.tar
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* sync Makefile with main branch
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* Fix fixture issues / other suggested changes
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* Trigger DCO check
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* precommit.ci changes
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* Refactor: move shared constants to utilities/constants.py and update guardrails tests
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Unify GPU orchestrator config fixtures and update guardrails GPU tests
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* Remove unnecessary guardrails_gateway_config and refactor GPU orchestrator config fixture
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Added docstring for orchestrator_config_gpu
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* tests/fixtures/guardrails.py: replace early return with if/else in orchestrator_config_gpu fixture
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* fix E501 line too long flake8 violation
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* test(fixtures): simplify orchestrator_config_gpu docstring
  Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
---------
Signed-off-by: Sandeep20013 <sandeepm20013@gmail.com>
Co-authored-by: Debarati Basu-Nag <dbasunag@redhat.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Karishma Punwatkar <kpunwatk@redhat.com>
diff --git a/.gitignore b/.gitignore
@@ -178,3 +178,4 @@ QWEN.md
 
 # Must-Gather Artifacts
 must-gather-collected/
+oc.tar
diff --git a/tests/fixtures/guardrails.py b/tests/fixtures/guardrails.py
@@ -2,6 +2,7 @@
 from typing import Any
 
 import pytest
+import yaml
 from _pytest.fixtures import FixtureRequest
 from kubernetes.dynamic import DynamicClient
 from ocp_resources.config_map import ConfigMap
@@ -12,7 +13,14 @@
 from ocp_resources.resource import ResourceEditor
 from ocp_resources.route import Route
 
-from utilities.constants import Annotations, Labels
+from tests.fixtures.inference import get_vllm_chat_config
+from utilities.constants import (
+    BUILTIN_DETECTOR_CONFIG,
+    HAP_DETECTOR,
+    PROMPT_INJECTION_DETECTOR,
+    Annotations,
+    Labels,
+)
 from utilities.guardrails import check_guardrails_health_endpoint
 
 GUARDRAILS_ORCHESTRATOR_NAME: str = "guardrails-orchestrator"
@@ -46,6 +54,14 @@ def guardrails_orchestrator(
             orchestrator_config = request.getfixturevalue(argname="orchestrator_config")
             gorch_kwargs["orchestrator_config"] = orchestrator_config.name
 
+        elif request.param.get("orchestrator_config_gpu"):
+            orchestrator_config = request.getfixturevalue(argname="orchestrator_config_gpu")
+            gorch_kwargs["orchestrator_config"] = orchestrator_config.name
+
+        elif request.param.get("orchestrator_config_builtin_gpu"):
+            orchestrator_config = request.getfixturevalue(argname="orchestrator_config_builtin_gpu")
+            gorch_kwargs["orchestrator_config"] = orchestrator_config.name
+
         if request.param.get("enable_guardrails_gateway"):
             gorch_kwargs["enable_guardrails_gateway"] = True
 
@@ -209,3 +225,79 @@ def guardrails_orchestrator_gateway_route(
         wait_for_resource=True,
         ensure_exists=True,
     )
+
+
+@pytest.fixture(scope="class")
+def orchestrator_config_gpu(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    teardown_resources: bool,
+    pytestconfig: pytest.Config,
+) -> Generator[ConfigMap, Any, Any]:
+    """
+    Provide the Guardrails Orchestrator ConfigMap used by the GPU (vLLM) tests.
+
+    Post-upgrade runs (``pytestconfig.option.post_upgrade`` is truthy) create
+    nothing: the ConfigMap ``fms-orchestr8-config-nlp`` is referenced in the
+    model namespace with ``ensure_exists=True``, yielded, and then cleaned up
+    when the fixture is torn down.
+
+    Otherwise a new ConfigMap with that name is created. Its data is driven by
+    the optional fixture parameter (a dict):
+
+    - ``orchestrator_config_data``: when truthy, used verbatim as the data.
+    - ``use_builtin_detectors``: when truthy, the generated ``config.yaml``
+      uses ``BUILTIN_DETECTOR_CONFIG``; otherwise it registers the prompt
+      injection and HAP detectors as external services at
+      ``<detector>-predictor.<namespace>.svc.cluster.local`` on port 80.
+
+    The generated ``config.yaml`` always takes its ``openai`` section from
+    ``get_vllm_chat_config`` for the model namespace.
+
+    Yields:
+        ConfigMap: the reused or newly created orchestrator ConfigMap.
+    """
+    if pytestconfig.option.post_upgrade:
+        cm = ConfigMap(
+            client=admin_client,
+            name="fms-orchestr8-config-nlp",
+            namespace=model_namespace.name,
+            ensure_exists=True,
+        )
+        yield cm
+        # NOTE(review): this clean-up runs regardless of `teardown_resources`,
+        # unlike the creation branch below — confirm that is intended.
+        cm.clean_up()
+
+    else:
+        # `request.param` only exists when the fixture is parametrized; a missing
+        # or None value is normalized to an empty dict so `.get()` is always safe.
+        param = getattr(request, "param", {}) or {}
+
+        # NOTE(review): `param` is always a dict here, so the `param and` guards
+        # below are redundant with the `.get()` checks.
+        if param and param.get("orchestrator_config_data"):
+            orchestrator_data = param["orchestrator_config_data"]
+
+        else:
+            # Decide detectors dynamically
+            if param and param.get("use_builtin_detectors"):
+                detectors = BUILTIN_DETECTOR_CONFIG
+            else:
+                # External detectors: both entries share the same shape and differ
+                # only in the detector name embedded in the service hostname.
+                # Presumably these hostnames resolve to the detectors' predictor
+                # services deployed in the same namespace — verify against the
+                # detector fixtures.
+                detectors = {
+                    PROMPT_INJECTION_DETECTOR: {
+                        "type": "text_contents",
+                        "service": {
+                            "hostname": (
+                                f"{PROMPT_INJECTION_DETECTOR}-predictor.{model_namespace.name}.svc.cluster.local"
+                            ),
+                            "port": 80,
+                        },
+                        "chunker_id": "whole_doc_chunker",
+                        "default_threshold": 0.5,
+                    },
+                    HAP_DETECTOR: {
+                        "type": "text_contents",
+                        "service": {
+                            "hostname": f"{HAP_DETECTOR}-predictor.{model_namespace.name}.svc.cluster.local",
+                            "port": 80,
+                        },
+                        "chunker_id": "whole_doc_chunker",
+                        "default_threshold": 0.5,
+                    },
+                }
+
+            # The ConfigMap stores the whole orchestrator configuration as one
+            # YAML document under the "config.yaml" key.
+            orchestrator_data = {
+                "config.yaml": yaml.dump({
+                    "openai": get_vllm_chat_config(model_namespace.name),
+                    "detectors": detectors,
+                })
+            }
+
+        # Teardown on context exit is controlled by `teardown_resources`.
+        with ConfigMap(
+            client=admin_client,
+            name="fms-orchestr8-config-nlp",
+            namespace=model_namespace.name,
+            data=orchestrator_data,
+            teardown=teardown_resources,
+        ) as cm:
+            yield cm
diff --git a/tests/fixtures/inference.py b/tests/fixtures/inference.py
@@ -21,6 +21,7 @@
     KServeDeploymentType,
     LLMdInferenceSimConfig,
     RuntimeTemplates,
+    VLLMGPUConfig,
 )
 from utilities.inference_utils import create_isvc
 from utilities.infra import get_data_science_cluster, wait_for_dsc_status_ready
@@ -245,3 +246,83 @@ def _wait_for_kserve_upgrade(dsc_resource: DataScienceCluster):
     else:
         LOGGER.info("DSC already configured for Headed mode")
         yield dsc
+
+
+@pytest.fixture(scope="class")
+def vllm_gpu_runtime(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+) -> Generator[ServingRuntime, Any, Any]:
+    """
+    Create the class-scoped vLLM CUDA ServingRuntime used by the GPU tests.
+
+    The runtime is named ``vllm-runtime-gpu``, is instantiated from the
+    ``RuntimeTemplates.VLLM_CUDA`` template in the model namespace with the
+    raw deployment type, and uses a runtime image pinned by digest. The
+    ``kserve-container`` is overridden to start vLLM's OpenAI API server on
+    port 8080, loading the model and tokenizer from ``/mnt/models`` in
+    float16 with ``--enforce-eager``, and is limited to one ``nvidia.com/gpu``.
+
+    Yields:
+        ServingRuntime: the created runtime; leaving the context manager
+        presumably removes it — confirm against ``ServingRuntimeFromTemplate``.
+    """
+
+    with ServingRuntimeFromTemplate(
+        client=admin_client,
+        name="vllm-runtime-gpu",
+        namespace=model_namespace.name,
+        template_name=RuntimeTemplates.VLLM_CUDA,
+        deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
+        # Image is pinned by sha256 digest rather than a tag.
+        runtime_image=(
+            "registry.redhat.io/rhaiis/vllm-cuda-rhel9@"
+            "sha256:ec799bb5eeb7e25b4b25a8917ab5161da6b6f1ab830cbba61bba371cffb0c34d"
+        ),
+        containers={
+            "kserve-container": {
+                "command": ["python", "-m", "vllm.entrypoints.openai.api_server"],
+                "args": [
+                    "--port=8080",
+                    "--model=/mnt/models",
+                    "--tokenizer=/mnt/models",
+                    # `{{.Name}}` is a template placeholder, presumably replaced
+                    # with the InferenceService name at deploy time — confirm.
+                    "--served-model-name={{.Name}}",
+                    "--dtype=float16",
+                    "--enforce-eager",
+                ],
+                # Must match the --port argument above.
+                "ports": [{"containerPort": 8080, "protocol": "TCP"}],
+                "resources": {"limits": {"nvidia.com/gpu": "1"}},
+            }
+        },
+    ) as runtime:
+        yield runtime
+
+
+@pytest.fixture(scope="class")
+def qwen_gpu_isvc(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    vllm_gpu_runtime: ServingRuntime,
+) -> Generator[InferenceService, Any, Any]:
+    """
+    Deploy the class-scoped Qwen InferenceService on the vLLM GPU runtime.
+
+    The InferenceService is named ``qwen3b``, uses the raw deployment mode and
+    the runtime provided by ``vllm_gpu_runtime``, and pulls the
+    qwen2.5-3b-instruct model from an OCI image pinned by digest. Auth is
+    disabled and creation waits for the predictor pods. The predictor requests
+    2 CPU / 8Gi and is limited to 4 CPU / 12Gi, with one ``nvidia.com/gpu`` in
+    both requests and limits.
+
+    Yields:
+        InferenceService: the deployed model service.
+    """
+
+    with create_isvc(
+        client=admin_client,
+        # NOTE(review): presumably the name `VLLMGPUConfig.get_hostname` is built
+        # from — confirm in utilities/constants.py before renaming.
+        name="qwen3b",
+        namespace=model_namespace.name,
+        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
+        model_format="vLLM",
+        runtime=vllm_gpu_runtime.name,
+        # Model image is pinned by sha256 digest rather than a tag.
+        storage_uri=(
+            "oci://quay.io/trustyai_testing/models/qwen2.5-3b-instruct@"
+            "sha256:6f9d9843599a9959de23c76d6b5adb556505482a7e732b2fcbca695a9c4ce545"
+        ),
+        enable_auth=False,
+        wait_for_predictor_pods=True,
+        resources={
+            "requests": {
+                "cpu": "2",
+                "memory": "8Gi",
+                "nvidia.com/gpu": "1",
+            },
+            "limits": {
+                "cpu": "4",
+                "memory": "12Gi",
+                "nvidia.com/gpu": "1",
+            },
+        },
+    ) as isvc:
+        yield isvc
+
+
+def get_vllm_chat_config(namespace: str) -> dict[str, Any]:
+    """
+    Build the chat-generation service section for the orchestrator config.
+
+    Args:
+        namespace: namespace passed to ``VLLMGPUConfig.get_hostname`` to
+            derive the service hostname.
+
+    Returns:
+        A dict of the form ``{"service": {"hostname": ..., "port": ...}}``
+        with both values taken from ``VLLMGPUConfig``. The
+        ``orchestrator_config_gpu`` fixture places it under the ``openai``
+        key of ``config.yaml``.
+    """
+    return {
+        "service": {
+            "hostname": VLLMGPUConfig.get_hostname(namespace),
+            "port": VLLMGPUConfig.port,
+        }
+    }
diff --git a/tests/model_explainability/guardrails/constants.py b/tests/model_explainability/guardrails/constants.py
@@ -12,8 +12,6 @@
 PII_ENDPOINT: str = "/pii"
 AUTOCONFIG_GATEWAY_ENDPOINT: str = "/all"
 STANDALONE_DETECTION_ENDPOINT: str = "api/v2/text/detection/content"
-PROMPT_INJECTION_DETECTOR: str = "prompt-injection-detector"
-HAP_DETECTOR: str = "hap-detector"
 
 
 @dataclass
@@ -38,6 +36,14 @@ class GuardrailsDetectionPrompt:
     detection_text=EXAMPLE_EMAIL_ADDRESS,
 )
 
+# Output-detection prompt variant for the Qwen model: asks the model to produce
+# an email address so the "regex" detector can flag it in the response.
+# NOTE(review): detection_text is left empty, presumably because the generated
+# address is not known in advance — confirm against the GPU test assertions.
+PII_OUTPUT_DETECTION_PROMPT_QWEN: GuardrailsDetectionPrompt = GuardrailsDetectionPrompt(
+    content="Give an example of a email address for a character in a story. Only return the email address string.",
+    detector_id="regex",
+    detection_name="email_address",
+    detection_type="pii",
+    detection_text="",
+)
+
 PII_OUTPUT_DETECTION_PROMPT: GuardrailsDetectionPrompt = GuardrailsDetectionPrompt(
     content="Give me one email and nothing else.",
     detector_id="regex",
diff --git a/tests/model_explainability/guardrails/test_guardrails.py b/tests/model_explainability/guardrails/test_guardrails.py
@@ -8,13 +8,11 @@
     AUTOCONFIG_DETECTOR_LABEL,
     AUTOCONFIG_GATEWAY_ENDPOINT,
     CHAT_COMPLETIONS_DETECTION_ENDPOINT,
-    HAP_DETECTOR,
     HAP_INPUT_DETECTION_PROMPT,
     HARMLESS_PROMPT,
     PII_ENDPOINT,
     PII_INPUT_DETECTION_PROMPT,
     PII_OUTPUT_DETECTION_PROMPT,
-    PROMPT_INJECTION_DETECTOR,
     PROMPT_INJECTION_INPUT_DETECTION_PROMPT,
     STANDALONE_DETECTION_ENDPOINT,
 )
@@ -29,7 +27,9 @@
 from tests.model_explainability.utils import validate_tai_component_images
 from utilities.constants import (
     BUILTIN_DETECTOR_CONFIG,
+    HAP_DETECTOR,
     LLM_D_CHAT_GENERATION_CONFIG,
+    PROMPT_INJECTION_DETECTOR,
     LLMdInferenceSimConfig,
     Timeout,
 )
diff --git a/tests/model_explainability/guardrails/test_guardrails_gpu.py b/tests/model_explainability/guardrails/test_guardrails_gpu.py
diff --git a/utilities/constants.py b/utilities/constants.py

Original file line number	Diff line number	Diff line change
`@@ -178,3 +178,4 @@ QWEN.md`
`178`	`178`
`179`	`179`	`# Must-Gather Artifacts`
`180`	`180`	`must-gather-collected/`
	`181`	`+oc.tar`