Add test to validate filter_options api response against DB query (#748)

lugi0 · web-flow · commit 125893f34e60 · 2025-10-24T10:09:15.000-04:00
* feat: add test to validate api response against DB query

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: resolve merge conflict

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: add xfail marker to prevent red failure

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: add small comment

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: add additional jira id for xfail

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: move db constants to their own file

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: push untracked change

Signed-off-by: lugi0 <lgiorgi@redhat.com>

* fix: fail if db pod not found with label

Signed-off-by: lugi0 <lgiorgi@redhat.com>

---------

Signed-off-by: lugi0 <lgiorgi@redhat.com>
diff --git a/tests/model_registry/model_catalog/db_constants.py b/tests/model_registry/model_catalog/db_constants.py
@@ -0,0 +1,40 @@
+# Constants useful for querying the model catalog database and parsing its responses
+
+# SQL query for filter_options endpoint database validation
+# Replicates the exact database query used by GetFilterableProperties for the filter_options endpoint
+# in kubeflow/model-registry catalog/internal/db/service/catalog_model.go
+# Note: type_id is looked up by the 'kf.CatalogModel' type name, not hard-coded, since it appears to be dynamic
+FILTER_OPTIONS_DB_QUERY = """
+SELECT name, array_agg(string_value) FROM (
+    SELECT
+        name,
+        string_value
+    FROM "ContextProperty" WHERE
+        context_id IN (
+            SELECT id FROM "Context" WHERE type_id = (
+                SELECT id FROM "Type" WHERE name = 'kf.CatalogModel'
+            )
+        )
+        AND string_value IS NOT NULL
+        AND string_value != ''
+        AND string_value IS NOT JSON ARRAY
+
+    UNION
+
+    SELECT
+        name,
+        json_array_elements_text(string_value::json) AS string_value
+    FROM "ContextProperty" WHERE
+        context_id IN (
+            SELECT id FROM "Context" WHERE type_id = (
+                SELECT id FROM "Type" WHERE name = 'kf.CatalogModel'
+            )
+        )
+        AND string_value IS JSON ARRAY
+)
+GROUP BY name HAVING MAX(CHAR_LENGTH(string_value)) <= 100;
+"""
+
+# Fields that are explicitly filtered out by the filter_options endpoint API
+# From db_catalog.go:204-206 in kubeflow/model-registry GetFilterOptions method
+API_EXCLUDED_FILTER_FIELDS = {"source_id", "logo", "license_link"}
diff --git a/tests/model_registry/model_catalog/test_filter_options_endpoint.py b/tests/model_registry/model_catalog/test_filter_options_endpoint.py
@@ -2,7 +2,13 @@
 from typing import Self
 from simple_logger.logger import get_logger
 
-from tests.model_registry.model_catalog.utils import validate_filter_options_structure
+from tests.model_registry.model_catalog.utils import (
+    validate_filter_options_structure,
+    parse_psql_array_agg_output,
+    get_postgres_pod_in_namespace,
+    compare_filter_options_with_database,
+)
+from tests.model_registry.model_catalog.db_constants import FILTER_OPTIONS_DB_QUERY, API_EXCLUDED_FILTER_FIELDS
 from tests.model_registry.utils import get_rest_headers, execute_get_command
 from utilities.user_utils import UserTestSession
 
@@ -13,30 +19,30 @@
 ]
 
 
-@pytest.mark.parametrize(
-    "user_token_for_api_calls,",
-    [
-        pytest.param(
-            {},
-            id="test_filter_options_admin_user",
-        ),
-        pytest.param(
-            {"user_type": "test"},
-            id="test_filter_options_non_admin_user",
-        ),
-        pytest.param(
-            {"user_type": "sa_user"},
-            id="test_filter_options_service_account",
-        ),
-    ],
-    indirect=["user_token_for_api_calls"],
-)
 class TestFilterOptionsEndpoint:
     """
     Test class for validating the models/filter_options endpoint
     RHOAIENG-36696
     """
 
+    @pytest.mark.parametrize(
+        "user_token_for_api_calls,",
+        [
+            pytest.param(
+                {},
+                id="test_filter_options_admin_user",
+            ),
+            pytest.param(
+                {"user_type": "test"},
+                id="test_filter_options_non_admin_user",
+            ),
+            pytest.param(
+                {"user_type": "sa_user"},
+                id="test_filter_options_service_account",
+            ),
+        ],
+        indirect=["user_token_for_api_calls"],
+    )
     def test_filter_options_endpoint_validation(
         self: Self,
         model_catalog_rest_url: list[str],
@@ -74,48 +80,67 @@ def test_filter_options_endpoint_validation(
         LOGGER.info(f"Found {len(filters)} filter properties: {list(filters.keys())}")
         LOGGER.info("All filter options validation passed successfully")
 
-    @pytest.mark.skip(reason="TODO: Implement after investigating backend DB queries")
+    # Cannot use non-admin user for this test as it cannot list the pods in the namespace
+    @pytest.mark.parametrize(
+        "user_token_for_api_calls,",
+        [
+            pytest.param(
+                {},
+                id="test_filter_options_admin_user",
+            ),
+            pytest.param(
+                {"user_type": "sa_user"},
+                id="test_filter_options_service_account",
+            ),
+        ],
+        indirect=["user_token_for_api_calls"],
+    )
+    @pytest.mark.xfail(strict=True, reason="RHOAIENG-37069: backend/API discrepancy expected")
     def test_comprehensive_coverage_against_database(
         self: Self,
         model_catalog_rest_url: list[str],
         user_token_for_api_calls: str,
-        test_idp_user: UserTestSession,
+        model_registry_namespace: str,
     ):
         """
-        STUBBED: Validate filter options are comprehensive across all sources/models in DB.
+        Validate filter options are comprehensive across all sources/models in DB.
         Acceptance Criteria: The returned options are comprehensive and not limited to a
         subset of models or a single source.
 
-        TODO IMPLEMENTATION PLAN:
-        1. Investigate backend endpoint logic:
-           - Find the source code for /models/filter_options endpoint in kubeflow/model-registry
-           - Understand what DB tables it queries (likely model/artifact tables)
-           - Identify the exact SQL queries used to build filter values
-           - Determine database schema and column names
-
-        2. Replicate queries via pod shell:
-           - Use get_model_catalog_pod() to access catalog pod
-           - Execute psql commands via pod.execute()
-           - Query same tables/columns the endpoint uses
-           - Extract all distinct values for string properties: SELECT DISTINCT license FROM models;
-           - Extract min/max ranges for numeric properties: SELECT MIN(metric), MAX(metric) FROM models;
-
-        3. Compare results:
-           - API response filter values should match DB query results exactly
-           - Ensure no values are missing (comprehensive coverage)
-           - Validate across all sources, not just one
-
-        4. DB Access Pattern Example:
-           catalog_pod = get_model_catalog_pod(client, namespace)[0]
-           result = catalog_pod.execute(
-               command=["psql", "-U", "catalog_user", "-d", "catalog_db", "-c", "SELECT DISTINCT license FROM models;"],
-               container="catalog"
-           )
-
-        5. Implementation considerations:
-           - Handle different data types (strings vs arrays like tasks)
-           - Parse psql output correctly
-           - Handle null/empty values
-           - Ensure database connection credentials are available
+        This test executes the exact same SQL query the API uses and compares results
+        to catch any discrepancies between database content and API response.
+
+        Expected failure because of RHOAIENG-37069 & RHOAIENG-37226
         """
-        pytest.skip("TODO: Implement comprehensive coverage validation after backend investigation")
+        api_url = f"{model_catalog_rest_url[0]}models/filter_options"
+        LOGGER.info(f"Testing comprehensive database coverage for: {api_url}")
+
+        api_response = execute_get_command(
+            url=api_url,
+            headers=get_rest_headers(token=user_token_for_api_calls),
+        )
+
+        api_filters = api_response["filters"]
+        LOGGER.info(f"API returned {len(api_filters)} filter properties: {list(api_filters.keys())}")
+
+        postgres_pod = get_postgres_pod_in_namespace(namespace=model_registry_namespace)
+        LOGGER.info(f"Using PostgreSQL pod: {postgres_pod.name}")
+
+        db_result = postgres_pod.execute(
+            command=["psql", "-U", "catalog_user", "-d", "model_catalog", "-c", FILTER_OPTIONS_DB_QUERY],
+            container="postgresql",
+        )
+
+        db_properties = parse_psql_array_agg_output(psql_output=db_result)
+        LOGGER.info(f"Raw database query returned {len(db_properties)} properties: {list(db_properties.keys())}")
+
+        is_valid, comparison_errors = compare_filter_options_with_database(
+            api_filters=api_filters, db_properties=db_properties, excluded_fields=API_EXCLUDED_FILTER_FIELDS
+        )
+
+        if not is_valid:
+            failure_msg = "Filter options API response does not match database content"
+            failure_msg += "\nDetailed comparison errors:\n" + "\n".join(comparison_errors)
+            assert False, failure_msg
+
+        LOGGER.info("Comprehensive database coverage validation passed - API matches database exactly")
diff --git a/tests/model_registry/model_catalog/utils.py b/tests/model_registry/model_catalog/utils.py
@@ -244,6 +244,140 @@ def validate_model_catalog_configmap_data(configmap: ConfigMap, num_catalogs: in
         validate_default_catalog(catalogs=catalogs)
 
 
+def parse_psql_array_agg_output(psql_output: str) -> dict[str, list[str]]:
+    """
+    Parse aligned psql output of a ``name | array_agg`` query into a Python dict.
+
+    Expected format:
+        name     | array_agg
+        ---------+----------
+        license  | {apache-2.0,mit,bsd}
+        provider | {Meta,"Acme, Inc."}
+
+    Elements that PostgreSQL wraps in double quotes are unquoted and their backslash
+    escapes resolved, so a comma inside a quoted value does not split that value.
+
+    Returns:
+        dict mapping property names to lists of values
+    """
+    result: dict[str, list[str]] = {}
+    header_seen = False
+    for raw_line in psql_output.strip().split("\n"):
+        line = raw_line.strip()
+        # Blank lines, the "----+----" separator and the "(N rows)" footer carry no data
+        if not line or line.startswith("-") or "|" not in line:
+            continue
+        if not header_seen:  # skip everything up to and including the column header row
+            header_seen = "array_agg" in line
+            continue
+        # Data row: "property_name | {val1,val2,val3}"
+        property_name, _, array_str = (part.strip() for part in line.partition("|"))
+        if not (array_str.startswith("{") and array_str.endswith("}")):
+            continue
+
+        # Scan the array literal one character at a time so quoting and escapes are honoured
+        values, chars = [], []
+        in_quotes = was_quoted = escaped = False
+        for char in array_str[1:-1]:
+            if escaped:
+                chars.append(char)
+                escaped = False
+            elif in_quotes and char == "\\":
+                escaped = True
+            elif char == '"':
+                in_quotes, was_quoted = not in_quotes, True
+            elif char == "," and not in_quotes:
+                values.append("".join(chars) if was_quoted else "".join(chars).strip())
+                chars, was_quoted = [], False
+            else:
+                chars.append(char)
+        if len(array_str) > 2:  # flush the last element; "{}" stays an empty list
+            values.append("".join(chars) if was_quoted else "".join(chars).strip())
+        result[property_name] = values
+    return result
+
+
+def get_postgres_pod_in_namespace(namespace: str = "rhoai-model-registries") -> Pod:
+    """Return the first pod labelled as the model catalog PostgreSQL instance; assert that one exists."""
+    matching_pods = [*Pod.get(namespace=namespace, label_selector="app.kubernetes.io/name=model-catalog-postgres")]
+    assert matching_pods, f"No PostgreSQL pod found in namespace {namespace}"
+    return matching_pods[0]
+
+
+def compare_filter_options_with_database(
+    api_filters: dict[str, Any], db_properties: dict[str, list[str]], excluded_fields: set[str]
+) -> Tuple[bool, List[str]]:
+    """
+    Compare API filter options response with database query results.
+
+    Properties listed in excluded_fields are dropped from the database side first, then
+    property names and their value sets are compared in both directions.
+
+    Note: Only string-valued properties are compared. An API filter that carries no
+    "values" list (e.g. a numeric/range filter) is treated as having no values instead
+    of raising KeyError, so it is reported as a discrepancy like any other.
+
+    Args:
+        api_filters: The "filters" dict from API response
+        db_properties: Raw database properties before API filtering
+        excluded_fields: Fields that API excludes from response
+
+    Returns:
+        Tuple of (is_valid, list_of_error_messages). Each discrepancy is reported once
+        and the messages are ordered by property name, with values sorted.
+    """
+    comparison_errors: list[str] = []
+
+    def record_error(message: str) -> None:
+        """Log a discrepancy and remember it for the caller."""
+        LOGGER.error(message)
+        comparison_errors.append(message)
+
+    # Apply the same filtering logic the API uses
+    expected_properties = {name: values for name, values in db_properties.items() if name not in excluded_fields}
+
+    LOGGER.info(f"Database returned {len(db_properties)} total properties")
+    LOGGER.info(
+        f"After applying API filtering, expecting {len(expected_properties)} properties: {list(expected_properties.keys())}"  # noqa: E501
+    )
+
+    # Walk the union of property names in a stable order so the report is reproducible
+    for prop_name in sorted(expected_properties.keys() | api_filters.keys()):
+        if prop_name not in api_filters:
+            record_error(
+                f"Property '{prop_name}': In DB ({len(expected_properties[prop_name])} values) but NOT in API"
+            )
+            continue
+
+        # A filter without "values" must not abort the comparison with a KeyError
+        api_value_list = api_filters[prop_name].get("values") or []
+        if prop_name not in expected_properties:
+            record_error(f"Property '{prop_name}': In API ({len(api_value_list)} values) but NOT in DB")
+            continue
+
+        db_values = set(expected_properties[prop_name])
+        api_values = set(api_value_list)
+        missing_values = sorted(db_values - api_values, key=str)
+        extra_values = sorted(api_values - db_values, key=str)
+
+        if missing_values:
+            record_error(
+                f"Property '{prop_name}': DB has {len(missing_values)} values missing from API: {missing_values}"
+            )
+        if extra_values:
+            record_error(
+                f"Property '{prop_name}': API has {len(extra_values)} values missing from DB: {extra_values}"
+            )
+        if not missing_values and not extra_values:
+            LOGGER.info(f"Property '{prop_name}': Perfect match ({len(api_values)} values)")
+
+    # Missing/extra properties were already recorded per property in the loop above, so no
+    # separate summary entries are appended (they would only duplicate those messages).
+
+    is_valid = not comparison_errors
+    return is_valid, comparison_errors
+
+
 def get_models_from_catalog_api(
     model_catalog_rest_url: list[str],
     model_registry_rest_headers: dict[str, str],
diff --git a/uv.lock b/uv.lock