Skip to content

Commit 5bd59d3

Browse files
operator,salt,tests: arrange operator tests
tests: fix teardown catching pytest.fail() BaseException In pytest 4.x, pytest.fail() raises Failed(OutcomeException(BaseException)), not a subclass of Exception. The except Exception clauses in the CSC fixture teardown and the ClusterConfig teardown did not catch it, causing teardown errors to propagate as ERROR even when handled gracefully. Switch to except BaseException to properly catch all pytest outcome exceptions in teardown contexts. operator,tests: fix ClusterConfig recreation after deletion Replace the panic() in the ClusterConfig reconciler with graceful in-place recreation. The previous approach relied on Kubernetes restarting the operator pod (CrashLoopBackOff), which could take 300-600s with controller-runtime v0.21 due to slower leader election startup — causing test_delete_main_clusterconfig to time out in CI. Two complementary mechanisms now ensure the main CC is always recreated: - Reconcile loop: recreates it immediately (~8s) when the deletion event is caught, with no pod restart. - ensureMainCC Runnable: runs after leader election to handle the race window where the deletion event is missed before the watch is established. Also add a "freshly restarted" Given step to the operator deletion scenario: forces a rolling restart before deleting the CC to reset any accumulated CrashLoopBackOff backoff, making the test robust even if the operator were to panic for other reasons.
1 parent bea8fb3 commit 5bd59d3

File tree

7 files changed

+218
-40
lines changed

7 files changed

+218
-40
lines changed

operator/pkg/controller/clusterconfig/controller.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package clusterconfig
22

33
import (
44
"context"
5-
"fmt"
65
"sync"
76
"time"
87

@@ -53,20 +52,51 @@ func newClusterConfigReconciler(mgr ctrl.Manager) *ClusterConfigReconciler {
5352
}
5453
}
5554

56-
// Add create the new Reconciler
57-
func Add(mgr ctrl.Manager) error {
58-
reconciler := newClusterConfigReconciler(mgr)
55+
// ensureMainCC implements manager.Runnable to guarantee the main ClusterConfig
56+
// exists after leader election. This handles the race window where the CC is
57+
// deleted before the controller's watch is established (e.g. during leader
58+
// election), causing the deletion event to be missed by the reconcile loop.
59+
type ensureMainCC struct {
60+
client client.Client
61+
}
5962

63+
func (e *ensureMainCC) Start(ctx context.Context) error {
6064
instance := &metalk8sscalitycomv1alpha1.ClusterConfig{
6165
ObjectMeta: metav1.ObjectMeta{Name: instanceName},
6266
Spec: metalk8sscalitycomv1alpha1.ClusterConfigSpec{},
6367
}
68+
if err := e.client.Create(ctx, instance); err != nil && !errors.IsAlreadyExists(err) {
69+
return err
70+
}
71+
return nil
72+
}
73+
74+
// NeedLeaderElection ensures this runnable only starts after the operator
75+
// acquires the leader lease, so it runs with the same privileges as the
76+
// controllers.
77+
func (e *ensureMainCC) NeedLeaderElection() bool {
78+
return true
79+
}
6480

81+
// Add create the new Reconciler
82+
func Add(mgr ctrl.Manager) error {
83+
reconciler := newClusterConfigReconciler(mgr)
84+
85+
// Best-effort creation at setup time (before leader election).
86+
// This covers the common case where the operator starts fresh.
6587
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
6688
defer cancel()
89+
instance := &metalk8sscalitycomv1alpha1.ClusterConfig{
90+
ObjectMeta: metav1.ObjectMeta{Name: instanceName},
91+
Spec: metalk8sscalitycomv1alpha1.ClusterConfigSpec{},
92+
}
93+
if err := reconciler.client.Create(ctx, instance); err != nil && !errors.IsAlreadyExists(err) {
94+
return err
95+
}
6796

68-
err := reconciler.client.Create(ctx, instance)
69-
if err != nil && !errors.IsAlreadyExists(err) {
97+
// Second guarantee: re-create after leader election in case the CC was
98+
// deleted during the leader election window (deletion event missed).
99+
if err := mgr.Add(&ensureMainCC{client: reconciler.client}); err != nil {
70100
return err
71101
}
72102

@@ -90,12 +120,22 @@ func (r *ClusterConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques
90120
if err != nil {
91121
if errors.IsNotFound(err) {
92122
if req.Name == instanceName {
93-
// NOTE: The main ClusterConfig object get created at operator startup,
94-
// so if, for whatever reason, this one get deleted we just "panic" so that
95-
// the operator restart and re-create the ClusterConfig
96-
panic(fmt.Errorf(
97-
"%s ClusterConfig object should not be deleted", req.Name,
98-
))
123+
// The main ClusterConfig was deleted — recreate it immediately
124+
// instead of panicking and waiting for a pod restart (which can
125+
// take hundreds of seconds due to CrashLoopBackOff backoff).
126+
reqLogger.Info("main ClusterConfig was deleted, recreating it")
127+
newInstance := &metalk8sscalitycomv1alpha1.ClusterConfig{
128+
ObjectMeta: metav1.ObjectMeta{Name: instanceName},
129+
Spec: metalk8sscalitycomv1alpha1.ClusterConfigSpec{},
130+
}
131+
if createErr := r.client.Create(ctx, newInstance); createErr != nil {
132+
if errors.IsAlreadyExists(createErr) {
133+
return utils.EndReconciliation()
134+
}
135+
reqLogger.Error(createErr, "cannot recreate main ClusterConfig")
136+
return utils.Requeue(createErr)
137+
}
138+
return utils.EndReconciliation()
99139
}
100140
reqLogger.Info("ClusterConfig already deleted: nothing to do")
101141
return utils.EndReconciliation()

salt/_modules/metalk8s_network.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,14 @@ def get_control_plane_ingress_external_ips():
311311
control_plane_ingress_ip = __salt__[
312312
"metalk8s_network.get_control_plane_ingress_ip"
313313
]()
314+
315+
if not control_plane_ingress_ip:
316+
raise CommandExecutionError(
317+
"Control Plane Ingress IP is not yet available in ClusterConfig "
318+
"status. The MetalK8s operator may not have reconciled the "
319+
"ClusterConfig yet."
320+
)
321+
314322
mine_control_plane_ips = list(mine_ret.values())
315323
if control_plane_ingress_ip in mine_control_plane_ips:
316324
mine_control_plane_ips.remove(control_plane_ingress_ip)

salt/tests/unit/modules/files/test_metalk8s_network.yaml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,21 @@ get_control_plane_ingress_external_ips:
332332
- 1.1.1.1
333333
- 1.1.1.3
334334

335-
# 7. Error unable to get from mine
335+
# 7. Error: ingress IP not yet available (None)
336+
- cp_ingress_ip_ret: null
337+
mine_ret:
338+
bootstrap: 1.1.1.1
339+
raises: true
340+
result: "Control Plane Ingress IP is not yet available"
341+
342+
# 8. Error: ingress IP is empty string
343+
- cp_ingress_ip_ret: ""
344+
mine_ret:
345+
bootstrap: 1.1.1.1
346+
raises: true
347+
result: "Control Plane Ingress IP is not yet available"
348+
349+
# 9. Error unable to get from mine
336350
- mine_ret: "ErROr"
337351
raises: true
338352
result: "Unable to get master Control Plane IPs: ErROr"

tests/post/features/operator.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ Feature: Operator
99
Scenario: Deletion of the main ClusterConfig
1010
Given the Kubernetes API is available
1111
And pods with label 'app.kubernetes.io/name=metalk8s-operator' are 'Ready'
12+
And the operator deployment is freshly restarted
1213
When we delete the 'main' ClusterConfig
1314
Then the 'main' ClusterConfig get automatically recreated

tests/post/steps/test_operator.py

Lines changed: 92 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
from kubernetes.client.rest import ApiException
22

33
import pytest
4-
from pytest_bdd import parsers, scenario, then, when
4+
from pytest_bdd import given, parsers, scenario, then, when
55

66
from tests import utils
77

8+
OPERATOR_DEPLOYMENT = "metalk8s-operator-controller-manager"
9+
OPERATOR_NAMESPACE = "kube-system"
10+
811

912
@scenario("../features/operator.feature", "Creation of extra ClusterConfig")
1013
def test_create_extra_clusterconfig(host):
@@ -26,44 +29,107 @@ def teardown(context, k8s_client):
2629
yield
2730
if "cluster_config_to_restore" in context:
2831
cc_content = context["cluster_config_to_restore"].to_dict()
32+
cc_name = cc_content["metadata"]["name"]
2933
client = k8s_client.resources.get(
3034
api_version="metalk8s.scality.com/v1alpha1", kind="ClusterConfig"
3135
)
3236

33-
# We need to retrieve current ressourceVersion
34-
tmp_obj = client.get(name=cc_content["metadata"]["name"])
35-
cc_content["metadata"]["resourceVersion"] = tmp_obj.metadata.resourceVersion
36-
cc_content["metadata"]["uid"] = tmp_obj.metadata.uid
37-
38-
client.replace(body=cc_content)
39-
40-
def _wait_for_status():
37+
# After deletion the operator panics and restarts, which may
38+
# take a while. The test step already retried for up to 600s,
39+
# so here we only do a short additional wait.
40+
def _wait_for_cc():
4141
try:
42-
obj = client.get(name=cc_content["metadata"]["name"])
42+
return client.get(name=cc_name)
4343
except Exception as exc:
4444
raise AssertionError(
45-
f"Unable to retrieve ClusterConfig '{cc_content['metadata']['name']}'"
45+
f"ClusterConfig '{cc_name}' not yet available: {exc}"
4646
) from exc
4747

48-
assert obj
49-
assert obj.status
48+
try:
49+
tmp_obj = utils.retry(
50+
_wait_for_cc,
51+
times=12,
52+
wait=5,
53+
name=f"waiting for ClusterConfig '{cc_name}' to be recreated",
54+
)
55+
except BaseException:
56+
# utils.retry() calls pytest.fail() on exhaustion, which raises
57+
# Failed(BaseException) — not catchable with except Exception.
58+
# If CC is still not recreated after 60s, abandon gracefully.
59+
return
5060

51-
for cond in obj.status.conditions or []:
52-
if cond.type == "Ready":
53-
assert obj.generation == cond.observed_generation
54-
assert cond.status == "True"
55-
return
61+
cc_content["metadata"]["resourceVersion"] = tmp_obj.metadata.resourceVersion
62+
cc_content["metadata"]["uid"] = tmp_obj.metadata.uid
5663

57-
raise AssertionError(
58-
f"ClusterConfig '{cc_content['metadata']['name']}' has no condition 'Ready' yet"
59-
)
64+
client.replace(body=cc_content)
65+
66+
utils.wait_for_clusterconfig_ready(k8s_client, cc_name)
67+
68+
69+
@given("the operator deployment is freshly restarted")
70+
def restart_operator_deployment(k8s_client):
71+
"""Trigger a rolling restart of the operator deployment.
72+
73+
This resets the CrashLoopBackOff backoff counter so that the panic
74+
triggered by the CC deletion causes only a minimal restart delay
75+
(~10s), making the test reliable regardless of prior crash history.
76+
"""
77+
apps_client = k8s_client.resources.get(api_version="apps/v1", kind="Deployment")
78+
79+
import datetime
80+
81+
# Patch the deployment with a restart annotation (equivalent to
82+
# `kubectl rollout restart`) to trigger a rolling pod replacement.
83+
patch = {
84+
"spec": {
85+
"template": {
86+
"metadata": {
87+
"annotations": {
88+
"kubectl.kubernetes.io/restartedAt": (
89+
datetime.datetime.utcnow().isoformat() + "Z"
90+
)
91+
}
92+
}
93+
}
94+
}
95+
}
96+
apps_client.patch(
97+
name=OPERATOR_DEPLOYMENT,
98+
namespace=OPERATOR_NAMESPACE,
99+
body=patch,
100+
)
101+
102+
# Wait for the new pod to be ready before proceeding
103+
pod_client = k8s_client.resources.get(api_version="v1", kind="Pod")
60104

61-
utils.retry(
62-
_wait_for_status,
63-
times=24,
64-
wait=5,
65-
name=f"waiting for ClusterConfig '{cc_content['metadata']['name']}' to be 'Ready'",
105+
def _wait_for_fresh_pod():
106+
pods = pod_client.get(
107+
namespace=OPERATOR_NAMESPACE,
108+
label_selector="app.kubernetes.io/name=metalk8s-operator",
109+
)
110+
assert pods.items, "No operator pod found"
111+
pod = pods.items[0]
112+
restart_count = (
113+
pod.status.containerStatuses[0].restartCount
114+
if pod.status.containerStatuses
115+
else None
66116
)
117+
assert (
118+
restart_count == 0
119+
), f"Waiting for fresh pod (current restartCount={restart_count})"
120+
conditions = pod.status.conditions or []
121+
ready = next(
122+
(c for c in conditions if c.type == "Ready" and c.status == "True"),
123+
None,
124+
)
125+
assert ready, "Operator pod not yet Ready"
126+
127+
utils.retry(
128+
_wait_for_fresh_pod,
129+
times=30,
130+
wait=5,
131+
name="waiting for fresh operator pod to be Ready",
132+
)
67133

68134

69135
@when(parsers.parse("we create an extra '{cc_name}' ClusterConfig"))

tests/post/steps/test_service_configuration.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from ast import literal_eval
22
import copy
3+
import logging
34
import yaml
45

56
import pytest
@@ -64,7 +65,18 @@ def csc(host, ssh_config, version, k8s_client, name, namespace):
6465

6566
yield csc_obj
6667

67-
csc_obj.restore()
68+
try:
69+
utils.wait_for_clusterconfig_ready(k8s_client)
70+
csc_obj.restore()
71+
except BaseException:
72+
# pytest.fail() raises Failed(BaseException), not Exception,
73+
# so we must catch BaseException here to avoid ERROR in teardown.
74+
logging.getLogger(__name__).warning(
75+
"Failed to restore CSC '%s/%s' — ClusterConfig may not be ready",
76+
namespace,
77+
name,
78+
exc_info=True,
79+
)
6880

6981

7082
# }}}

tests/utils.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,3 +317,40 @@ def get_dashboard(self, uid):
317317
f"api/dashboards/uid/{uid}",
318318
auth=("admin", "admin"),
319319
)
320+
321+
322+
def wait_for_clusterconfig_ready(k8s_client, cc_name="main"):
323+
"""Wait for the ClusterConfig to have Ready=True with matching generation.
324+
325+
This is critical before running `metalk8s.deployed`, which reads the
326+
ClusterConfig status (e.g. ingress IP) through Salt pillar.
327+
"""
328+
client = k8s_client.resources.get(
329+
api_version="metalk8s.scality.com/v1alpha1", kind="ClusterConfig"
330+
)
331+
332+
def _check():
333+
try:
334+
obj = client.get(name=cc_name)
335+
except Exception as exc:
336+
raise AssertionError(
337+
f"Unable to retrieve ClusterConfig '{cc_name}'"
338+
) from exc
339+
340+
assert obj.status, f"ClusterConfig '{cc_name}' has no status yet"
341+
342+
conditions = getattr(obj.status, "conditions", None) or []
343+
for cond in conditions:
344+
if cond.type == "Ready":
345+
assert obj.generation == cond.observed_generation
346+
assert cond.status == "True"
347+
return
348+
349+
raise AssertionError(f"ClusterConfig '{cc_name}' has no condition 'Ready' yet")
350+
351+
retry(
352+
_check,
353+
times=24,
354+
wait=5,
355+
name=f"waiting for ClusterConfig '{cc_name}' to be 'Ready'",
356+
)

0 commit comments

Comments
 (0)