Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ocp_resources.pod import Pod
from ocp_resources.secret import Secret
from ocp_resources.service import Service
from ocp_utilities.monitoring import Prometheus
from pyhelper_utils.shell import run_command
from pytest import FixtureRequest, Config
from kubernetes.dynamic import DynamicClient
Expand All @@ -23,6 +24,7 @@
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

from utilities.certificates_utils import create_ca_bundle_file
from utilities.data_science_cluster_utils import update_components_in_dsc
from utilities.exceptions import ClusterLoginError
from utilities.infra import (
Expand Down Expand Up @@ -520,3 +522,15 @@ def cluster_sanity_scope_session(
dsci_resource=dsci_resource,
junitxml_property=junitxml_plugin,
)


@pytest.fixture(scope="session")
def prometheus(admin_client: DynamicClient) -> Prometheus:
    """Session-scoped Prometheus client backed by the cluster's thanos-querier route."""
    # TODO: Verify SSL with appropriate certs
    ca_bundle_path = create_ca_bundle_file(client=admin_client, ca_type="openshift")
    return Prometheus(
        client=admin_client,
        resource_name="thanos-querier",
        verify_ssl=ca_bundle_path,
        bearer_token=get_openshift_token(),
    )
18 changes: 18 additions & 0 deletions tests/model_explainability/trustyai_service/drift/test_drift.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from functools import partial

import pytest

from tests.model_explainability.trustyai_service.constants import DRIFT_BASE_DATA_PATH
Expand All @@ -11,6 +13,7 @@
)
from utilities.constants import MinIo
from utilities.manifests.openvino import OPENVINO_KSERVE_INFERENCE_CONFIG
from utilities.monitoring import validate_metrics_field, get_metric_label


@pytest.mark.parametrize(
Expand Down Expand Up @@ -103,6 +106,21 @@ def test_drift_metric_schedule_meanshift(
},
)

def test_drift_metric_prometheus(
    self,
    admin_client,
    model_namespace,
    trustyai_service_with_pvc_storage,
    gaussian_credit_model,
    prometheus,
):
    """Check that the scheduled MEANSHIFT drift metric is exported to Prometheus with the expected metricName label."""
    drift_query = f'trustyai_{TrustyAIServiceMetrics.Drift.MEANSHIFT}{{namespace="{model_namespace.name}"}}'
    validate_metrics_field(
        prometheus=prometheus,
        metrics_query=drift_query,
        expected_value=TrustyAIServiceMetrics.Drift.MEANSHIFT.upper(),
        field_getter=partial(get_metric_label, label_name="metricName"),
    )

def test_drift_metric_delete(
self,
admin_client,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import partial
from typing import Any

import pytest
Expand All @@ -13,6 +14,7 @@
)
from utilities.constants import MinIo
from utilities.manifests.openvino import OPENVINO_KSERVE_INFERENCE_CONFIG
from utilities.monitoring import validate_metrics_field, get_metric_label

BASE_DATA_PATH: str = "./tests/model_explainability/trustyai_service/fairness/model_data"
IS_MALE_IDENTIFYING: str = "Is Male-Identifying?"
Expand Down Expand Up @@ -115,6 +117,21 @@ def test_fairness_metric_schedule_spd_with_pvc_storage(
json_data=get_fairness_request_json_data(isvc=onnx_loan_model),
)

def test_fairness_metric_prometheus(
    self,
    admin_client,
    model_namespace,
    trustyai_service_with_pvc_storage,
    onnx_loan_model,
    prometheus,
):
    """Check that the scheduled SPD fairness metric is exported to Prometheus with the expected metricName label."""
    fairness_query = f'trustyai_{TrustyAIServiceMetrics.Fairness.SPD}{{namespace="{model_namespace.name}"}}'
    validate_metrics_field(
        prometheus=prometheus,
        metrics_query=fairness_query,
        expected_value=TrustyAIServiceMetrics.Fairness.SPD.upper(),
        field_getter=partial(get_metric_label, label_name="metricName"),
    )

def test_fairness_metric_delete_with_pvc_storage(
self, admin_client, current_client_token, trustyai_service_with_pvc_storage, onnx_loan_model
):
Expand Down
12 changes: 0 additions & 12 deletions tests/model_serving/model_server/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from ocp_resources.service_account import ServiceAccount
from ocp_resources.serving_runtime import ServingRuntime
from ocp_resources.storage_class import StorageClass
from ocp_utilities.monitoring import Prometheus
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

Expand All @@ -32,7 +31,6 @@
)
from utilities.inference_utils import create_isvc
from utilities.infra import (
get_openshift_token,
s3_endpoint_secret,
update_configmap_data,
)
Expand Down Expand Up @@ -394,16 +392,6 @@ def http_s3_tensorflow_model_mesh_inference_service(
yield isvc


@pytest.fixture(scope="session")
def prometheus(admin_client: DynamicClient) -> Prometheus:
return Prometheus(
client=admin_client,
resource_name="thanos-querier",
verify_ssl=False,
bearer_token=get_openshift_token(),
)


@pytest.fixture(scope="class")
def user_workload_monitoring_config_map(
admin_client: DynamicClient, cluster_monitoring_config: ConfigMap
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from utilities.inference_utils import Inference
from utilities.manifests.caikit_tgis import CAIKIT_TGIS_INFERENCE_CONFIG
from utilities.monitoring import get_metrics_value, validate_metrics_value
from utilities.monitoring import get_metrics_value, validate_metrics_field

pytestmark = [
pytest.mark.serverless,
Expand Down Expand Up @@ -57,7 +57,7 @@ def test_model_metrics_num_success_requests(self, s3_models_inference_service, p
model_name=ModelFormat.CAIKIT,
use_default_query=True,
)
validate_metrics_value(
validate_metrics_field(
prometheus=prometheus,
metrics_query="tgi_request_success",
expected_value="1",
Expand All @@ -78,7 +78,7 @@ def test_model_metrics_num_total_requests(self, s3_models_inference_service, pro
iterations=total_runs,
run_in_parallel=True,
)
validate_metrics_value(
validate_metrics_field(
prometheus=prometheus,
metrics_query="tgi_request_count",
expected_value=str(total_runs + 1),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from utilities.constants import ModelFormat, ModelStoragePath, Protocols
from utilities.inference_utils import Inference
from utilities.manifests.caikit_tgis import CAIKIT_TGIS_INFERENCE_CONFIG
from utilities.monitoring import validate_metrics_value
from utilities.monitoring import validate_metrics_field


@pytest.mark.parametrize(
Expand Down Expand Up @@ -67,7 +67,7 @@ def test_non_admin_raw_metrics(
model_name=ModelFormat.CAIKIT,
iterations=total_runs,
)
validate_metrics_value(
validate_metrics_field(
prometheus=prometheus,
metrics_query="tgi_request_count",
expected_value=str(total_runs),
Expand Down
91 changes: 60 additions & 31 deletions utilities/monitoring.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any
from typing import Any, Callable

from ocp_resources.prometheus import Prometheus
from simple_logger.logger import get_logger
Expand All @@ -7,53 +7,82 @@
LOGGER = get_logger(name=__name__)


def get_metrics_value(prometheus: Prometheus, metrics_query: str) -> Any:
    """
    Get the value of the first metric matching an instant query.

    Args:
        prometheus (Prometheus): Prometheus object
        metrics_query (str): Metrics query string

    Returns:
        Any: The metric value of the first result that has one, or None
            when the query matched nothing.
    """
    metric_results = prometheus.query_sampler(query=metrics_query)
    for metric_result in metric_results:
        # Prometheus instant-query results carry "value" as a [timestamp, value] pair.
        # The previous flattening approach could crash on a missing "value" key and
        # could return a timestamp instead of a value; pick the value explicitly.
        value_pair = metric_result.get("value") or []
        if len(value_pair) > 1:
            return value_pair[1]
    return None


def get_metric_label(
    prometheus: Prometheus,
    metrics_query: str,
    label_name: str,
) -> Any:
    """
    Get the value of a specific label from the first matching metric.

    Args:
        prometheus (Prometheus): Prometheus object
        metrics_query (str): Metrics query string
        label_name (str): Label to retrieve

    Returns:
        Any: Value of the requested label, or None if not found
    """
    query_results = prometheus.query_sampler(query=metrics_query)
    LOGGER.info(f"Fields: {query_results}")

    if not query_results:
        return None

    # Only the first matching series is inspected.
    first_series = query_results[0]
    return first_series["metric"].get(label_name)


def validate_metrics_field(
    prometheus: Prometheus,
    metrics_query: str,
    expected_value: Any,
    field_getter: Callable[..., Any] = get_metrics_value,
    timeout: int = 60 * 4,
) -> None:
    """
    Validate any metric field or label using a custom getter function.
    Defaults to checking the metric's value if no getter is provided.

    Args:
        prometheus (Prometheus): Prometheus object
        metrics_query (str): Metrics query string
        expected_value (Any): Expected value
        field_getter (Callable): Function to extract the desired field/label/value;
            must accept prometheus= and metrics_query= keyword arguments
            (e.g. functools.partial(get_metric_label, label_name=...)).
        timeout (int): Timeout in seconds

    Raises:
        TimeoutExpiredError: If expected value isn't met within the timeout
    """
    sample = None
    try:
        # Poll the getter until it returns the expected value or the timeout expires.
        for sample in TimeoutSampler(
            wait_timeout=timeout,
            sleep=15,
            func=field_getter,
            prometheus=prometheus,
            metrics_query=metrics_query,
        ):
            if sample == expected_value:
                LOGGER.info("Metric field matches the expected value!")
                return
            LOGGER.info(f"Current value: {sample}, waiting for: {expected_value}")
    except TimeoutExpiredError:
        LOGGER.error(f"Timed out. Last value: {sample}, expected: {expected_value}")
        raise


def get_metrics_value(prometheus: Prometheus, metrics_query: str) -> Any:
"""
Get metrics value from prometheus

Args:
prometheus (Prometheus): Prometheus object
metrics_query (str): Metrics query string

Returns:
Any: Metrics value

"""
metric_results = prometheus.query_sampler(query=metrics_query)
if metric_values_list := [value for metric_val in metric_results for value in metric_val.get("value")]:
return metric_values_list[1]