Skip to content

Commit 95b2b80

Browse files
oilbeaterclaude
andcommitted
fix: prevent subnet from getting permanently stuck when VLAN is not ready (#6352)
Fix two bugs that combine to cause underlay subnets to get permanently stuck during controller startup when the VLAN is created after the subnet. Bug 1: In handleAddOrUpdateSubnet, variable shadowing (err :=) and overwriting (err =) in the VLAN/subnet validation error paths caused patchSubnetStatus success to zero out the original validation error. The handler returned nil, making the work queue forget the item instead of retrying it. Fix by using a separate patchErr variable for the patch call and using = instead of := for the error wrapping. Bug 2: handleAddVlan did not re-enqueue subnets that reference the newly created VLAN. Once a subnet's validation failed and was forgotten by the queue, no event would trigger it to be reprocessed. Fix by iterating over subnets at the end of handleAddVlan and adding those referencing the VLAN back to the addOrUpdateSubnetQueue. Signed-off-by: Mengxin Liu <liumengxinfly@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> (cherry picked from commit e9b65ce)
1 parent ca9282c commit 95b2b80

File tree

4 files changed

+60
-7
lines changed

4 files changed

+60
-7
lines changed

pkg/controller/controller_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ import (
2828
"k8s.io/client-go/informers"
2929
coreinformers "k8s.io/client-go/informers/core/v1"
3030
"k8s.io/client-go/kubernetes/fake"
31+
"k8s.io/client-go/tools/record"
32+
"k8s.io/utils/keymutex"
3133

3234
mockovs "github.com/kubeovn/kube-ovn/mocks/pkg/ovs"
3335
kubeovnv1 "github.com/kubeovn/kube-ovn/pkg/apis/kubeovn/v1"
@@ -43,6 +45,7 @@ type fakeControllerInformers struct {
4345
vpcNatGwInformer kubeovninformer.VpcNatGatewayInformer
4446
subnetInformer kubeovninformer.SubnetInformer
4547
ipInformer kubeovninformer.IPInformer
48+
vlanInformer kubeovninformer.VlanInformer
4649
serviceInformer coreinformers.ServiceInformer
4750
namespaceInformer coreinformers.NamespaceInformer
4851
podInformer coreinformers.PodInformer
@@ -60,6 +63,7 @@ func alwaysReady() bool { return true }
6063
type FakeControllerOptions struct {
6164
Subnets []*kubeovnv1.Subnet
6265
IPs []*kubeovnv1.IP
66+
Vlans []*kubeovnv1.Vlan
6367
NetworkAttachments []*nadv1.NetworkAttachmentDefinition
6468
Pods []*corev1.Pod
6569
Namespaces []*corev1.Namespace
@@ -122,6 +126,13 @@ func newFakeControllerWithOptions(t *testing.T, opts *FakeControllerOptions) (*f
122126
return nil, err
123127
}
124128
}
129+
for _, vlan := range opts.Vlans {
130+
_, err := kubeovnClient.KubeovnV1().Vlans().Create(
131+
context.Background(), vlan, metav1.CreateOptions{})
132+
if err != nil {
133+
return nil, err
134+
}
135+
}
125136

126137
// Create informer factories
127138
kubeInformerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
@@ -137,12 +148,14 @@ func newFakeControllerWithOptions(t *testing.T, opts *FakeControllerOptions) (*f
137148
subnetInformer := kubeovnInformerFactory.Kubeovn().V1().Subnets()
138149
ipInformer := kubeovnInformerFactory.Kubeovn().V1().IPs()
139150
vpcNatGwInformer := kubeovnInformerFactory.Kubeovn().V1().VpcNatGateways()
151+
vlanInformer := kubeovnInformerFactory.Kubeovn().V1().Vlans()
140152

141153
fakeInformers := &fakeControllerInformers{
142154
vpcInformer: vpcInformer,
143155
vpcNatGwInformer: vpcNatGwInformer,
144156
subnetInformer: subnetInformer,
145157
ipInformer: ipInformer,
158+
vlanInformer: vlanInformer,
146159
serviceInformer: serviceInformer,
147160
namespaceInformer: namespaceInformer,
148161
podInformer: podInformer,
@@ -162,10 +175,14 @@ func newFakeControllerWithOptions(t *testing.T, opts *FakeControllerOptions) (*f
162175
subnetSynced: alwaysReady,
163176
ipsLister: ipInformer.Lister(),
164177
ipSynced: alwaysReady,
178+
vlansLister: vlanInformer.Lister(),
165179
netAttachLister: nadInformer.Lister(),
166180
netAttachSynced: alwaysReady,
167181
OVNNbClient: mockOvnClient,
168182
ipam: ovnipam.NewIPAM(),
183+
recorder: record.NewFakeRecorder(100),
184+
subnetKeyMutex: keymutex.NewHashed(0),
185+
addOrUpdateSubnetQueue: newTypedRateLimitingQueue[string]("AddOrUpdateSubnet", nil),
169186
syncVirtualPortsQueue: newTypedRateLimitingQueue[string]("SyncVirtualPort", nil),
170187
updateSubnetStatusQueue: newTypedRateLimitingQueue[string]("UpdateSubnetStatus", nil),
171188
}

pkg/controller/subnet.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -495,20 +495,20 @@ func (c *Controller) handleAddOrUpdateSubnet(key string) error {
495495

496496
err = c.validateSubnetVlan(subnet)
497497
if err != nil {
498-
err := fmt.Errorf("failed to validate vlan for subnet %s, %w", key, err)
498+
err = fmt.Errorf("failed to validate vlan for subnet %s, %w", key, err)
499499
klog.Error(err)
500-
if err = c.patchSubnetStatus(subnet, "ValidateSubnetVlanFailed", err.Error()); err != nil {
501-
klog.Error(err)
502-
return err
500+
if patchErr := c.patchSubnetStatus(subnet, "ValidateSubnetVlanFailed", err.Error()); patchErr != nil {
501+
klog.Error(patchErr)
502+
return patchErr
503503
}
504504
return err
505505
}
506506

507507
if err = util.ValidateSubnet(*subnet); err != nil {
508508
klog.Errorf("failed to validate subnet %s, %v", subnet.Name, err)
509-
if err = c.patchSubnetStatus(subnet, "ValidateLogicalSwitchFailed", err.Error()); err != nil {
510-
klog.Error(err)
511-
return err
509+
if patchErr := c.patchSubnetStatus(subnet, "ValidateLogicalSwitchFailed", err.Error()); patchErr != nil {
510+
klog.Error(patchErr)
511+
return patchErr
512512
}
513513
return err
514514
}

pkg/controller/subnet_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,34 @@ func Test_formatSubnet(t *testing.T) {
276276
}
277277
}
278278

279+
func Test_handleAddOrUpdateSubnet_vlanValidationError(t *testing.T) {
280+
t.Parallel()
281+
282+
// Create a subnet that references a non-existent vlan
283+
subnet := &kubeovnv1.Subnet{
284+
ObjectMeta: metav1.ObjectMeta{
285+
Name: "test-underlay",
286+
},
287+
Spec: kubeovnv1.SubnetSpec{
288+
CIDRBlock: "10.0.0.0/24",
289+
Gateway: "10.0.0.1",
290+
Vlan: "non-existent-vlan",
291+
},
292+
}
293+
294+
fakeController, err := newFakeControllerWithOptions(t, &FakeControllerOptions{
295+
Subnets: []*kubeovnv1.Subnet{subnet},
296+
})
297+
require.NoError(t, err)
298+
ctrl := fakeController.fakeController
299+
300+
// handleAddOrUpdateSubnet should return an error when the vlan does not exist,
301+
// so that the work queue retries the item instead of forgetting it
302+
err = ctrl.handleAddOrUpdateSubnet("test-underlay")
303+
require.Error(t, err)
304+
require.Contains(t, err.Error(), "failed to validate vlan")
305+
}
306+
279307
func Test_isOvnSubnet(t *testing.T) {
280308
t.Parallel()
281309

pkg/controller/vlan.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,14 @@ func (c *Controller) handleAddVlan(key string) error {
116116
}
117117
}
118118

119+
// re-enqueue subnets that reference this vlan, so they can proceed
120+
// if they were previously blocked by the vlan not being ready
121+
for _, subnet := range subnets {
122+
if subnet.Spec.Vlan == vlan.Name {
123+
c.addOrUpdateSubnetQueue.Add(subnet.Name)
124+
}
125+
}
126+
119127
return nil
120128
}
121129

0 commit comments

Comments
 (0)