Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ocp_resources.pod import Pod
from ocp_resources.secret import Secret
from ocp_resources.service import Service
from ocp_utilities.monitoring import Prometheus
from pyhelper_utils.shell import run_command
from pytest import FixtureRequest, Config
from kubernetes.dynamic import DynamicClient
Expand All @@ -23,6 +24,7 @@
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

from utilities.certificates_utils import create_ca_bundle_file
from utilities.data_science_cluster_utils import update_components_in_dsc
from utilities.exceptions import ClusterLoginError
from utilities.infra import (
Expand Down Expand Up @@ -520,3 +522,15 @@ def cluster_sanity_scope_session(
dsci_resource=dsci_resource,
junitxml_property=junitxml_plugin,
)


@pytest.fixture(scope="session")
def prometheus(admin_client: DynamicClient) -> Prometheus:
    """Session-scoped Prometheus client backed by the cluster's thanos-querier route."""
    # TODO: Verify SSL with appropriate certs
    ca_bundle_path = create_ca_bundle_file(client=admin_client, ca_type="openshift")
    return Prometheus(
        client=admin_client,
        resource_name="thanos-querier",
        verify_ssl=ca_bundle_path,
        bearer_token=get_openshift_token(),
    )
18 changes: 18 additions & 0 deletions tests/model_explainability/trustyai_service/drift/test_drift.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from functools import partial

import pytest

from tests.model_explainability.trustyai_service.constants import DRIFT_BASE_DATA_PATH
Expand All @@ -11,6 +13,7 @@
)
from utilities.constants import MinIo
from utilities.manifests.openvino import OPENVINO_KSERVE_INFERENCE_CONFIG
from utilities.monitoring import validate_metrics_field, get_metric_label


@pytest.mark.parametrize(
Expand Down Expand Up @@ -103,6 +106,21 @@ def test_drift_metric_schedule_meanshift(
},
)

def test_drift_metric_prometheus(
    self,
    admin_client,
    model_namespace,
    trustyai_service_with_pvc_storage,
    gaussian_credit_model,
    prometheus,
):
    """Check that the scheduled MEANSHIFT drift metric is exported to Prometheus with the expected metricName label."""
    drift_query = f'trustyai_{TrustyAIServiceMetrics.Drift.MEANSHIFT}{{namespace="{model_namespace.name}"}}'
    validate_metrics_field(
        prometheus=prometheus,
        metrics_query=drift_query,
        expected_value=TrustyAIServiceMetrics.Drift.MEANSHIFT.upper(),
        field_getter=partial(get_metric_label, label_name="metricName"),
    )

def test_drift_metric_delete(
self,
admin_client,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import partial
from typing import Any

import pytest
Expand All @@ -13,6 +14,7 @@
)
from utilities.constants import MinIo
from utilities.manifests.openvino import OPENVINO_KSERVE_INFERENCE_CONFIG
from utilities.monitoring import validate_metrics_field, get_metric_label

BASE_DATA_PATH: str = "./tests/model_explainability/trustyai_service/fairness/model_data"
IS_MALE_IDENTIFYING: str = "Is Male-Identifying?"
Expand Down Expand Up @@ -115,6 +117,21 @@ def test_fairness_metric_schedule_spd_with_pvc_storage(
json_data=get_fairness_request_json_data(isvc=onnx_loan_model),
)

def test_fairness_metric_prometheus(
    self,
    admin_client,
    model_namespace,
    trustyai_service_with_pvc_storage,
    onnx_loan_model,
    prometheus,
):
    """Check that the scheduled SPD fairness metric is exported to Prometheus with the expected metricName label."""
    fairness_query = f'trustyai_{TrustyAIServiceMetrics.Fairness.SPD}{{namespace="{model_namespace.name}"}}'
    validate_metrics_field(
        prometheus=prometheus,
        metrics_query=fairness_query,
        expected_value=TrustyAIServiceMetrics.Fairness.SPD.upper(),
        field_getter=partial(get_metric_label, label_name="metricName"),
    )

def test_fairness_metric_delete_with_pvc_storage(
self, admin_client, current_client_token, trustyai_service_with_pvc_storage, onnx_loan_model
):
Expand Down
12 changes: 0 additions & 12 deletions tests/model_serving/model_server/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from ocp_resources.service_account import ServiceAccount
from ocp_resources.serving_runtime import ServingRuntime
from ocp_resources.storage_class import StorageClass
from ocp_utilities.monitoring import Prometheus
from pytest_testconfig import config as py_config
from simple_logger.logger import get_logger

Expand All @@ -32,7 +31,6 @@
)
from utilities.inference_utils import create_isvc
from utilities.infra import (
get_openshift_token,
s3_endpoint_secret,
update_configmap_data,
)
Expand Down Expand Up @@ -394,16 +392,6 @@ def http_s3_tensorflow_model_mesh_inference_service(
yield isvc


@pytest.fixture(scope="session")
def prometheus(admin_client: DynamicClient) -> Prometheus:
return Prometheus(
client=admin_client,
resource_name="thanos-querier",
verify_ssl=False,
bearer_token=get_openshift_token(),
)


@pytest.fixture(scope="class")
def user_workload_monitoring_config_map(
admin_client: DynamicClient, cluster_monitoring_config: ConfigMap
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from utilities.inference_utils import Inference
from utilities.manifests.caikit_tgis import CAIKIT_TGIS_INFERENCE_CONFIG
from utilities.monitoring import get_metrics_value, validate_metrics_value
from utilities.monitoring import get_metrics_value, validate_metrics_field

pytestmark = [
pytest.mark.serverless,
Expand Down Expand Up @@ -57,7 +57,7 @@ def test_model_metrics_num_success_requests(self, s3_models_inference_service, p
model_name=ModelFormat.CAIKIT,
use_default_query=True,
)
validate_metrics_value(
validate_metrics_field(
prometheus=prometheus,
metrics_query="tgi_request_success",
expected_value="1",
Expand All @@ -78,7 +78,7 @@ def test_model_metrics_num_total_requests(self, s3_models_inference_service, pro
iterations=total_runs,
run_in_parallel=True,
)
validate_metrics_value(
validate_metrics_field(
prometheus=prometheus,
metrics_query="tgi_request_count",
expected_value=str(total_runs + 1),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from utilities.constants import ModelFormat, ModelStoragePath, Protocols
from utilities.inference_utils import Inference
from utilities.manifests.caikit_tgis import CAIKIT_TGIS_INFERENCE_CONFIG
from utilities.monitoring import validate_metrics_value
from utilities.monitoring import validate_metrics_field


@pytest.mark.parametrize(
Expand Down Expand Up @@ -67,7 +67,7 @@ def test_non_admin_raw_metrics(
model_name=ModelFormat.CAIKIT,
iterations=total_runs,
)
validate_metrics_value(
validate_metrics_field(
prometheus=prometheus,
metrics_query="tgi_request_count",
expected_value=str(total_runs),
Expand Down
91 changes: 60 additions & 31 deletions utilities/monitoring.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any
from typing import Any, Callable

from ocp_resources.prometheus import Prometheus
from simple_logger.logger import get_logger
Expand All @@ -7,53 +7,82 @@
LOGGER = get_logger(name=__name__)


def get_metrics_value(prometheus: Prometheus, metrics_query: str) -> Any:
    """
    Get the value of the first metric matching an instant query.

    Args:
        prometheus (Prometheus): Prometheus object
        metrics_query (str): Metrics query string

    Returns:
        Any: The metric value of the first result that has one, or None
            when the query matched nothing.
    """
    metric_results = prometheus.query_sampler(query=metrics_query)
    for metric_result in metric_results:
        # Prometheus instant-query results carry "value" as a [timestamp, value] pair.
        # The previous flattening approach could crash on a missing "value" key and
        # could return a timestamp instead of a value; pick the value explicitly.
        value_pair = metric_result.get("value") or []
        if len(value_pair) > 1:
            return value_pair[1]
    return None


def get_metric_label(
    prometheus: Prometheus,
    metrics_query: str,
    label_name: str,
) -> Any:
    """
    Get the value of a specific label from the first matching metric.

    Args:
        prometheus (Prometheus): Prometheus object
        metrics_query (str): Metrics query string
        label_name (str): Label to retrieve

    Returns:
        Any: Value of the requested label, or None if not found
    """
    query_results = prometheus.query_sampler(query=metrics_query)
    LOGGER.info(f"Fields: {query_results}")

    if not query_results:
        return None

    # Only the first matching series is inspected.
    first_series = query_results[0]
    return first_series["metric"].get(label_name)


def validate_metrics_field(
    prometheus: Prometheus,
    metrics_query: str,
    expected_value: Any,
    field_getter: Callable[..., Any] = get_metrics_value,
    timeout: int = 60 * 4,
) -> None:
    """
    Validate any metric field or label using a custom getter function.
    Defaults to checking the metric's value if no getter is provided.

    Args:
        prometheus (Prometheus): Prometheus object
        metrics_query (str): Metrics query string
        expected_value (Any): Expected value
        field_getter (Callable): Function to extract the desired field/label/value;
            must accept prometheus= and metrics_query= keyword arguments
            (e.g. functools.partial(get_metric_label, label_name=...)).
        timeout (int): Timeout in seconds

    Raises:
        TimeoutExpiredError: If expected value isn't met within the timeout
    """
    sample = None
    try:
        # Poll the getter until it returns the expected value or the timeout expires.
        for sample in TimeoutSampler(
            wait_timeout=timeout,
            sleep=15,
            func=field_getter,
            prometheus=prometheus,
            metrics_query=metrics_query,
        ):
            if sample == expected_value:
                LOGGER.info("Metric field matches the expected value!")
                return
            LOGGER.info(f"Current value: {sample}, waiting for: {expected_value}")
    except TimeoutExpiredError:
        LOGGER.error(f"Timed out. Last value: {sample}, expected: {expected_value}")
        raise


def get_metrics_value(prometheus: Prometheus, metrics_query: str) -> Any:
"""
Get metrics value from prometheus

Args:
prometheus (Prometheus): Prometheus object
metrics_query (str): Metrics query string

Returns:
Any: Metrics value

"""
metric_results = prometheus.query_sampler(query=metrics_query)
if metric_values_list := [value for metric_val in metric_results for value in metric_val.get("value")]:
return metric_values_list[1]