test(model-validation): run raw and serverless execution in parallel (#693)

Snomaan6846 · web-flow · commit 6ca7af87f514 · 2025-10-16T12:41:40.000+05:30
Signed-off-by: Snomaan6846 <syedali@redhat.com>

rh-pre-commit.version: 2.3.2
rh-pre-commit.check-secrets: ENABLED
diff --git a/conftest.py b/conftest.py
@@ -280,9 +280,11 @@ def pytest_sessionstart(session: Session) -> None:
         pathlib.Path(tests_log_file).unlink()
     if session.config.getoption("--collect-must-gather"):
         session.config.option.must_gather_db = Database()
+    thread_name = os.environ.get("PYTEST_XDIST_WORKER", "master")
     session.config.option.log_listener = setup_logging(
         log_file=tests_log_file,
         log_level=session.config.getoption("log_cli_level") or logging.INFO,
+        thread_name=thread_name,
     )
     must_gather_dict = set_must_gather_collector_values()
     shutil.rmtree(
diff --git a/pytest.ini b/pytest.ini
@@ -7,6 +7,7 @@ markers =
     polarion: Store polarion test ID
     jira: Store jira bug ID
     skip_on_disconnected: Mark tests that can only be run in deployments with Internet access i.e. not on disconnected clusters.
+    parallel: Mark tests that can run in parallel under pytest-xdist.
 
     # CI
     smoke: Mark tests as smoke tests; covers core functionality of the product. Aims to ensure that the build is stable enough for further testing.
diff --git a/tests/model_serving/model_runtime/model_validation/conftest.py b/tests/model_serving/model_runtime/model_validation/conftest.py
@@ -1,5 +1,5 @@
 import json
-from typing import Any, Generator
+from typing import Any, Generator, List
 
 import pytest
 import yaml
@@ -158,51 +158,89 @@ def deployment_config(request: FixtureRequest) -> dict[str, Any]:
 
 
 def build_raw_params(
-    name: str, image: str, args: list[str], gpu_count: int, model_output_type: str = "text"
+    name: str,
+    image: str,
+    args: list[str],
+    gpu_count: int,
+    execution_mode: str,
+    model_output_type: str = "text",
 ) -> tuple[Any, str]:
     test_id = f"{name}-raw"
+    deployment_type = KServeDeploymentType.RAW_DEPLOYMENT
     param = pytest.param(
         {"name": "raw-model-validation"},
-        {"deployment_type": KServeDeploymentType.RAW_DEPLOYMENT},
+        {"deployment_type": deployment_type},
         {
             "model_name": name,
             "model_car_image_uri": image,
         },
         {
-            "deployment_type": KServeDeploymentType.RAW_DEPLOYMENT,
+            "deployment_type": deployment_type,
             "runtime_argument": args,
             "gpu_count": gpu_count,
             "model_output_type": model_output_type,
         },
         id=test_id,
-        marks=[pytest.mark.rawdeployment],
+        marks=build_pytest_markers(deployment_type=deployment_type, execution_mode=execution_mode),
     )
     return param, test_id
 
 
 def build_serverless_params(
-    name: str, image: str, args: list[str], gpu_count: int, model_output_type: str = "text"
+    name: str,
+    image: str,
+    args: list[str],
+    gpu_count: int,
+    execution_mode: str,
+    model_output_type: str = "text",
 ) -> tuple[Any, str]:
     test_id = f"{name}-serverless"
+    deployment_type = KServeDeploymentType.SERVERLESS
     param = pytest.param(
         {"name": "serverless-model-validation"},
-        {"deployment_type": KServeDeploymentType.SERVERLESS},
+        {"deployment_type": deployment_type},
         {
             "model_name": name,
             "model_car_image_uri": image,
         },
         {
-            "deployment_type": KServeDeploymentType.SERVERLESS,
+            "deployment_type": deployment_type,
             "runtime_argument": args,
             "gpu_count": gpu_count,
             "model_output_type": model_output_type,
         },
         id=test_id,
-        marks=[pytest.mark.serverless],
+        marks=build_pytest_markers(deployment_type=deployment_type, execution_mode=execution_mode),
     )
     return param, test_id
 
 
+def build_pytest_markers(deployment_type: str, execution_mode: str) -> List[Any]:
+    """
+    Build a list of pytest markers based on the deployment type and execution mode.
+
+    Args:
+        deployment_type (str): Deployment type (e.g., RAW_DEPLOYMENT, SERVERLESS)
+        execution_mode (str): "parallel" or "sequential"
+
+    Returns:
+        List[Any]: List of pytest.mark objects to attach to the test
+    """
+    markers: List[pytest.MarkDecorator] = []
+
+    if deployment_type == KServeDeploymentType.RAW_DEPLOYMENT:
+        markers.append(pytest.mark.rawdeployment)
+    elif deployment_type == KServeDeploymentType.SERVERLESS:
+        markers.append(pytest.mark.serverless)
+
+    # Execution mode markers
+    if execution_mode == "parallel":
+        markers.append(pytest.mark.parallel)
+        markers.append(pytest.mark.skip_must_gather)
+
+    return markers
+
+
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
     yaml_config = None
     yaml_path = metafunc.config.getoption(name="model_car_yaml_path")
@@ -232,6 +270,10 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
 
         name = model_car.get("name", "").strip()
         image = model_car.get("image", "").strip()
+        execution_mode = (
+            model_car.get("execution_mode", "").strip()
+            or default_serving_config.get("execution_mode", "sequential").strip()
+        )
 
         if not name or not image:
             continue
@@ -243,11 +285,21 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
 
         if metafunc.cls.__name__ == "TestVLLMModelCarRaw":
             param, test_id = build_raw_params(
-                name=name, image=image, args=args, gpu_count=gpu_count, model_output_type=model_output_type
+                name=name,
+                image=image,
+                args=args,
+                gpu_count=gpu_count,
+                execution_mode=execution_mode,
+                model_output_type=model_output_type,
             )
         elif metafunc.cls.__name__ == "TestVLLMModelCarServerless":
             param, test_id = build_serverless_params(
-                name=name, image=image, args=args, gpu_count=gpu_count, model_output_type=model_output_type
+                name=name,
+                image=image,
+                args=args,
+                gpu_count=gpu_count,
+                execution_mode=execution_mode,
+                model_output_type=model_output_type,
             )
         else:
             continue
diff --git a/tests/model_serving/model_runtime/model_validation/sample_modelcar_config.yaml b/tests/model_serving/model_runtime/model_validation/sample_modelcar_config.yaml
@@ -2,6 +2,7 @@ model-car:
   - name: granite-3.1-8b-base-quantized.w4a16
     image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-base-quantized-w4a16:1.5
     model_output_type: text
+    execution_mode: parallel
     serving_arguments:
       args:
         - "--uvicorn-log-level=info"
@@ -13,6 +14,7 @@ model-car:
   - name: whisper-large-v2-W4A16-G128
     image: oci://registry.redhat.io/rhelai1/modelcar-whisper-large-v2-w4a16-g128:1.5
     model_output_type: audio
+    execution_mode: parallel
     serving_arguments:
       args:
         - "--uvicorn-log-level=info"
@@ -23,14 +25,17 @@ model-car:
   - name: granite-3.1-8b-starter-v2
     image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-starter-v2:1.5
     model_output_type: text
+    execution_mode: sequential
 
   - name: granite-3.1-8b-instruct-quantized.w8a8
     image: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-instruct-quantized-w8a8:1.5
     model_output_type: text
+    execution_mode: parallel
 
   - name: Llama-3.1-8B-Instruct
     image: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct:1.5
     model_output_type: text
+    execution_mode: sequential
     serving_arguments:
       args:
         - "--uvicorn-log-level=debug"
@@ -42,6 +47,7 @@ model-car:
   - name: Mistral-7B-Instruct-v0.3-quantized.w4a16
     image: oci://registry.redhat.io/rhelai1/modelcar-mistral-7b-instruct-v0-3-quantized-w4a16:1.5
     model_output_type: text
+    execution_mode: parallel
     serving_arguments:
       args:
         - "--uvicorn-log-level=debug"
@@ -53,11 +59,12 @@ model-car:
   - name: Qwen2.5-7B-Instruct-quantized.w8a8
     image: oci://registry.redhat.io/rhelai1/modelcar-qwen2-5-7b-instruct-quantized-w8a8:1.5
     model_output_type: text
-
+    execution_mode: parallel
 
   - name: Qwen2.5-7B-Instruct-FP8-dynamic
     image: oci://registry.redhat.io/rhelai1/modelcar-qwen2-5-7b-instruct-fp8-dynamic:1.5
     model_output_type: text
+    execution_mode: parallel
     serving_arguments:
       args:
         - "--uvicorn-log-level=debug"
@@ -69,6 +76,7 @@ model-car:
   - name: DeepSeek-R1-Distill-Llama-8B-FP8-dynamic
     image: oci://registry.redhat.io/rhelai1/modelcar-deepseek-r1-distill-llama-8b-fp8-dynamic:1.5
     model_output_type: text
+    execution_mode: parallel
     serving_arguments:
       args:
         - "--uvicorn-log-level=debug"
@@ -80,6 +88,7 @@ model-car:
   - name: phi-4-quantized.w4a16
     image: oci://registry.redhat.io/rhelai1/modelcar-phi-4-quantized-w4a16:1.5
     model_output_type: text
+    execution_mode: parallel
     serving_arguments:
       args:
         - "--uvicorn-log-level=debug"
@@ -91,6 +100,7 @@ model-car:
   - name: phi-4-quantized.w8a8
     image: oci://registry.redhat.io/rhelai1/modelcar-phi-4-quantized-w8a8:1.5
     model_output_type: text
+    execution_mode: sequential
     serving_arguments:
       args:
         - "--uvicorn-log-level=debug"
@@ -107,3 +117,4 @@ default:
         - "--trust-remote-code"
         - "--distributed-executor-backend=mp"
       gpu_count: 1
+    execution_mode: sequential
diff --git a/utilities/logger.py b/utilities/logger.py
@@ -21,14 +21,17 @@ def __repr__(self) -> str:
         return "'***REDACTED***'"
 
 
-def setup_logging(log_level: int, log_file: str = "/tmp/pytest-tests.log") -> QueueListener:
+def setup_logging(
+    log_level: int, log_file: str = "/tmp/pytest-tests.log", thread_name: str | None = None
+) -> QueueListener:
     """
     Setup basic/root logging using QueueHandler/QueueListener
     to consolidate log messages into a single stream to be written to multiple outputs.
 
     Args:
         log_level (int): log level
         log_file (str): logging output file
+        thread_name (str | None): optional worker/thread name prepended to each log line in brackets, e.g. "gw0" -> "[gw0]"
 
     Returns:
         QueueListener: Process monitoring the log Queue
@@ -38,9 +41,16 @@ def setup_logging(log_level: int, log_file: str = "/tmp/pytest-tests.log") -> Qu
                          ├> Queue -> QueueListener ┤
       basic QueueHandler ┘                         └> FileHandler
     """
-    basic_log_formatter = logging.Formatter(fmt="%(message)s")
+    basic_fmt_str = "%(message)s"
+    root_fmt_str = "%(asctime)s %(name)s %(log_color)s%(levelname)s%(reset)s %(message)s"
+
+    if thread_name:
+        basic_fmt_str = f"[{thread_name}] {basic_fmt_str}"
+        root_fmt_str = f"[{thread_name}] {root_fmt_str}"
+
+    basic_log_formatter = logging.Formatter(fmt=basic_fmt_str)
     root_log_formatter = WrapperLogFormatter(
-        fmt="%(asctime)s %(name)s %(log_color)s%(levelname)s%(reset)s %(message)s",
+        fmt=root_fmt_str,
         log_colors={
             "DEBUG": "cyan",
             "INFO": "green",
@@ -67,20 +77,28 @@ def setup_logging(log_level: int, log_file: str = "/tmp/pytest-tests.log") -> Qu
 
     basic_logger = logging.getLogger(name="basic")
     basic_logger.setLevel(level=log_level)
+    basic_logger.handlers.clear()
     basic_logger.addHandler(hdlr=basic_log_queue_handler)
 
     root_log_queue_handler = QueueHandler(queue=log_queue)
     root_log_queue_handler.set_name(name="root")
     root_log_queue_handler.setFormatter(fmt=root_log_formatter)
 
-    root_logger = logging.getLogger()
+    root_logger = logging.getLogger(name="root")
     root_logger.setLevel(level=log_level)
+    root_logger.handlers.clear()
     root_logger.addHandler(hdlr=root_log_queue_handler)
     root_logger.addFilter(filter=DuplicateFilter())
 
     root_logger.propagate = False
     basic_logger.propagate = False
 
+    for name, logger in logging.root.manager.loggerDict.items():
+        if isinstance(logger, logging.Logger) and (name not in ("root", "basic")):
+            logger.handlers.clear()
+            logger.addHandler(hdlr=root_log_queue_handler)
+            logger.propagate = False
+
     log_listener.start()
     return log_listener