Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 60 additions & 44 deletions tests/model_registry/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from utilities.constants import DscComponents
from model_registry import ModelRegistry as ModelRegistryClient
from utilities.general import wait_for_pods_by_labels
from utilities.infra import get_data_science_cluster
from utilities.infra import get_data_science_cluster, wait_for_dsc_status_ready

DEFAULT_TOKEN_DURATION = "10m"
LOGGER = get_logger(name=__name__)
Expand Down Expand Up @@ -169,57 +169,73 @@ def updated_dsc_component_state_scope_session(
pytestconfig: Config,
request: FixtureRequest,
admin_client: DynamicClient,
teardown_resources: bool,
) -> Generator[DataScienceCluster, Any, Any]:
dsc_resource = get_data_science_cluster(client=admin_client)
if not teardown_resources or pytestconfig.option.post_upgrade:
# if we are not tearing down resources or we are in post upgrade, we don't need to do anything
# the pre_upgrade/post_upgrade fixtures will handle the rest
yield dsc_resource
else:
original_components = dsc_resource.instance.spec.components
component_patch = {
DscComponents.MODELREGISTRY: {
"managementState": DscComponents.ManagementState.MANAGED,
"registriesNamespace": py_config["model_registry_namespace"],
},
}
LOGGER.info(f"Applying patch {component_patch}")

with ResourceEditor(patches={dsc_resource: {"spec": {"components": component_patch}}}):
for component_name in component_patch:
dsc_resource.wait_for_condition(
condition=DscComponents.COMPONENT_MAPPING[component_name], status="True"
original_namespace_name = dsc_resource.instance.spec.components.modelregistry.registriesNamespace
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this can be moved into the `if`, I believe

if pytestconfig.option.custom_namespace:
resource_editor = ResourceEditor(
patches={
dsc_resource: {
"spec": {
"components": {
DscComponents.MODELREGISTRY: {
"managementState": DscComponents.ManagementState.REMOVED,
"registriesNamespace": original_namespace_name,
},
}
}
}
}
)
try:
# first disable MR
resource_editor.update(backup_resources=True)
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Replace single-shot DSC readiness check with retry loop

Prevents transient failures after patching DSC.

-            wait_for_dsc_status_ready(dsc_resource=dsc_resource)
+            # Wait for DSC to reconcile after disabling MR (retry to avoid flakes)
+            for _ in range(60):  # ~5 minutes
+                try:
+                    wait_for_dsc_status_ready(dsc_resource=dsc_resource)
+                    break
+                except Exception:
+                    time.sleep(5)
+            else:
+                # Final raise with the same helper for context
+                wait_for_dsc_status_ready(dsc_resource=dsc_resource)

Add once near the imports:

import time
🤖 Prompt for AI Agents
In tests/model_registry/conftest.py around line 193, the single-shot call to
wait_for_dsc_status_ready(dsc_resource=dsc_resource) can fail transiently after
patching the DSC; replace it with a retry loop that attempts the readiness check
repeatedly until success or a timeout. Add import time near the top imports,
then implement a loop that records a start time, calls
wait_for_dsc_status_ready(dsc_resource=dsc_resource) inside a try/except (or
checks the return), sleeps a short interval (e.g., 1–2s) between attempts, and
breaks on success; if the timeout is exceeded, raise an informative exception so
the test fails deterministically.

# now delete the original namespace:
Comment on lines +191 to +194
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

“wait_for_dsc_status_ready” is a single-shot check, not a wait — high flake risk.

This helper returns immediately and raises if not yet Ready. After patching DSC, readiness can take time; a single check will intermittently fail.

Recommended: replace this call with a real poll here, or update utilities.infra.wait_for_dsc_status_ready to poll with a timeout. If changing here, for example:

-            resource_editor.update(backup_resources=True)
-            wait_for_dsc_status_ready(dsc_resource=dsc_resource)
+            resource_editor.update(backup_resources=True)
+            # Poll for DSC readiness to avoid flakes
+            for _ in range(36):  # ~3 minutes @5s
+                try:
+                    wait_for_dsc_status_ready(dsc_resource=dsc_resource)
+                    break
+                except Exception:
+                    import time; time.sleep(5)
+            else:
+                # Final raise with context
+                wait_for_dsc_status_ready(dsc_resource=dsc_resource)

Alternatively, upgrade the util to actually wait and keep the single call here.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# first disable MR
resource_editor.update(backup_resources=True)
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
# now delete the original namespace:
# first disable MR
resource_editor.update(backup_resources=True)
# Poll for DSC readiness to avoid flakes
for _ in range(36): # ~3 minutes @5s
try:
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
break
except Exception:
import time; time.sleep(5)
else:
# Final raise with context
wait_for_dsc_status_ready(dsc_resource=dsc_resource)
# now delete the original namespace:
🤖 Prompt for AI Agents
In tests/model_registry/conftest.py around lines 191-194, the call to
wait_for_dsc_status_ready is a single-shot check and causes flakes because DSC
readiness can take time; replace this single immediate call with a polling wait:
after resource_editor.update(backup_resources=True) loop until the DSC reports
Ready (or raise on timeout), checking status every 1–2 seconds and using a
sensible overall timeout (e.g., 60–120s); alternatively, modify
utilities.infra.wait_for_dsc_status_ready to perform the same
polling-with-timeout behavior and keep the single call here.

original_namespace = Namespace(name=original_namespace_name, wait_for_resource=True)
original_namespace.delete(wait=True)
Comment on lines +195 to +196
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we actually need to delete it? it could be left there since we are creating it again after a few lines

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cleaned it up; that way there is no confusion about leftover resources.

# Now enable it with the custom namespace
with ResourceEditor(
patches={
dsc_resource: {
"spec": {
"components": {
DscComponents.MODELREGISTRY: {
"managementState": DscComponents.ManagementState.MANAGED,
"registriesNamespace": py_config["model_registry_namespace"],
},
}
}
}
}
):
namespace = Namespace(name=py_config["model_registry_namespace"], wait_for_resource=True)
namespace.wait_for_status(status=Namespace.Status.ACTIVE)
wait_for_pods_running(
admin_client=admin_client,
namespace_name=py_config["applications_namespace"],
number_of_consecutive_checks=6,
)
wait_for_pods_running(
admin_client=admin_client,
namespace_name=py_config["model_registry_namespace"],
number_of_consecutive_checks=6,
)
namespace = Namespace(name=py_config["model_registry_namespace"], wait_for_resource=True)
namespace.wait_for_status(status=Namespace.Status.ACTIVE)
yield dsc_resource
finally:
resource_editor.restore()
Namespace(name=py_config["model_registry_namespace"]).delete(wait=True)
# create the original namespace object again, so that we can wait for it to be created first
original_namespace = Namespace(name=original_namespace_name, wait_for_resource=True)
original_namespace.wait_for_status(status=Namespace.Status.ACTIVE)
wait_for_pods_running(
admin_client=admin_client,
namespace_name=py_config["applications_namespace"],
number_of_consecutive_checks=6,
)
Comment on lines +225 to 235
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Restore order: recreate original NS before restoring DSC; add readiness + MR-NS pod waits

Current flow restores DSC while the original namespace may not exist, causing noisy reconciliation. Also, you only wait for pods in the applications namespace on restore.

-                yield dsc_resource
-        finally:
-            resource_editor.restore()
-            Namespace(name=py_config["model_registry_namespace"]).delete(wait=True)
-            # create the original namespace object again, so that we can wait for it to be created first
-            original_namespace = Namespace(name=original_namespace_name, wait_for_resource=True)
-            original_namespace.wait_for_status(status=Namespace.Status.ACTIVE)
-            wait_for_pods_running(
-                admin_client=admin_client,
-                namespace_name=py_config["applications_namespace"],
-                number_of_consecutive_checks=6,
-            )
+                yield dsc_resource
+        finally:
+            # First, delete the custom namespace while MR is still Removed (after exiting the inner editor)
+            if py_config["model_registry_namespace"] != original_namespace_name:
+                Namespace(name=py_config["model_registry_namespace"]).delete(wait=True)
+            # Ensure the original namespace exists before pointing DSC back to it
+            original_namespace = Namespace(name=original_namespace_name, wait_for_resource=True)
+            original_namespace.wait_for_status(status=Namespace.Status.ACTIVE)
+            # Restore DSC to its original spec (likely Managed in the original namespace)
+            resource_editor.restore()
+            # Wait for DSC to become READY again (retry to avoid flakes)
+            for _ in range(60):
+                try:
+                    wait_for_dsc_status_ready(dsc_resource=dsc_resource)
+                    break
+                except Exception:
+                    time.sleep(5)
+            else:
+                wait_for_dsc_status_ready(dsc_resource=dsc_resource)
+            # Wait for pods to settle in both app and MR namespaces
+            wait_for_pods_running(
+                admin_client=admin_client,
+                namespace_name=py_config["applications_namespace"],
+                number_of_consecutive_checks=6,
+            )
+            wait_for_pods_running(
+                admin_client=admin_client,
+                namespace_name=original_namespace_name,
+                number_of_consecutive_checks=6,
+            )

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tests/model_registry/conftest.py around lines 225 to 235, the teardown
currently restores the DSC before ensuring the original namespace exists and
only waits for pods in the applications namespace; change the order to recreate
the original Namespace object (with wait_for_resource=True) and wait for it to
reach ACTIVE before calling resource_editor.restore(), and after restore add
wait_for_pods_running checks for both the model-registry namespace
(py_config["model_registry_namespace"]) and the applications namespace
(py_config["applications_namespace"]) with appropriate
number_of_consecutive_checks (e.g., 6) to ensure readiness of MR-NS pods as well
as app pods.

wait_for_pods_running(
admin_client=admin_client,
namespace_name=py_config["model_registry_namespace"],
number_of_consecutive_checks=6,
)
yield dsc_resource

for component_name, value in component_patch.items():
LOGGER.info(f"Waiting for component {component_name} to be updated.")
if original_components[component_name]["managementState"] == DscComponents.ManagementState.MANAGED:
dsc_resource.wait_for_condition(
condition=DscComponents.COMPONENT_MAPPING[component_name], status="True"
)
if (
component_name == DscComponents.MODELREGISTRY
and value.get("managementState") == DscComponents.ManagementState.MANAGED
):
# Since the namespace specified in registriesNamespace is automatically created after setting
# managementState to Managed, we need to explicitly delete it on clean up.
namespace = Namespace(name=py_config["model_registry_namespace"], ensure_exists=True)
if namespace:
namespace.delete(wait=True)
else:
LOGGER.info("Model Registry is enabled by default and does not require any setup.")
yield dsc_resource


@pytest.fixture(scope="class")
Expand Down
65 changes: 0 additions & 65 deletions tests/model_registry/upgrade/conftest.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
],
indirect=True,
)
@pytest.mark.usefixtures("pre_upgrade_dsc_patch", "model_registry_metadata_db_resources", "model_registry_instance")
@pytest.mark.usefixtures("model_registry_metadata_db_resources", "model_registry_instance")
class TestPreUpgradeModelRegistry:
@pytest.mark.pre_upgrade
def test_registering_model_pre_upgrade(
Expand All @@ -43,7 +43,6 @@ def test_registering_model_pre_upgrade(
pytest.fail("errors found in model registry response validation:\n{}".format("\n".join(errors)))


@pytest.mark.usefixtures("post_upgrade_dsc_patch")
class TestPostUpgradeModelRegistry:
@pytest.mark.post_upgrade
def test_retrieving_model_post_upgrade(
Expand Down