Skip to content

Commit af72723

Browse files
authored
fix: caching NAD CRD should before all kubeovn crds and pod (#6198)
* fix: caching NAD CRD should before all kubeovn crds and pod --------- Signed-off-by: zbb88888 <jmdxjsjgcxy@gmail.com>
1 parent dd1880d commit af72723

File tree

4 files changed

+204
-27
lines changed

4 files changed

+204
-27
lines changed

pkg/controller/controller.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -695,14 +695,17 @@ func Run(ctx context.Context, config *Configuration) {
695695
defer controller.shutdown()
696696
klog.Info("Starting OVN controller")
697697

698+
// Start and sync NAD informer first, as many resources depend on NAD cache
699+
// NAD CRD is optional, so we check if it exists before starting the informer
700+
controller.StartNetAttachInformerFactory(ctx)
701+
698702
// Wait for the caches to be synced before starting workers
699703
controller.informerFactory.Start(ctx.Done())
700704
controller.cmInformerFactory.Start(ctx.Done())
701705
controller.deployInformerFactory.Start(ctx.Done())
702706
controller.kubeovnInformerFactory.Start(ctx.Done())
703707
controller.anpInformerFactory.Start(ctx.Done())
704708
controller.StartKubevirtInformerFactory(ctx, kubevirtInformerFactory)
705-
controller.StartNetAttachInformerFactory(ctx)
706709

707710
klog.Info("Waiting for informer caches to sync")
708711
cacheSyncs := []cache.InformerSynced{

pkg/controller/network_attachment.go

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,29 +22,54 @@ func (c *Controller) isNetAttachCRDInstalled() (bool, error) {
2222
)
2323
}
2424

25-
// the network-attachment-definition CRD is not required to be installed so
26-
// periodically check and see if we should start the informer.
25+
// startNetAttachInformer starts the NAD informer and waits for cache sync.
26+
func (c *Controller) startNetAttachInformer(ctx context.Context) {
27+
c.netAttachInformerFactory.Start(ctx.Done())
28+
if !cache.WaitForCacheSync(ctx.Done(), c.netAttachSynced) {
29+
util.LogFatalAndExit(nil, "failed to wait for network attachment cache to sync")
30+
}
31+
klog.Info("Network attachment informer cache synced")
32+
}
33+
34+
// tryStartNetAttachInformer checks if NAD CRD is installed and starts the informer if so.
35+
// Returns true if informer was started, false otherwise.
36+
func (c *Controller) tryStartNetAttachInformer(ctx context.Context) bool {
37+
exists, err := c.isNetAttachCRDInstalled()
38+
if err != nil {
39+
klog.Warningf("failed to check if network attachment CRD exists: %v", err)
40+
return false
41+
}
42+
if !exists {
43+
return false
44+
}
45+
klog.Info("Network attachment CRD found, starting informer")
46+
c.startNetAttachInformer(ctx)
47+
return true
48+
}
49+
50+
// StartNetAttachInformerFactory starts the network attachment definition (NAD) informer.
51+
// This MUST be called before other informers that depend on NAD cache (Pod, Subnet, VpcNatGateway, etc.)
52+
//
53+
// The NAD CRD is optional - if installed, we start the informer synchronously and wait for
54+
// cache sync before returning. This ensures NAD cache is ready when other controllers start
55+
// processing resources that reference NADs.
56+
//
57+
// If the CRD is not installed at startup, we start a background goroutine to periodically
58+
// check and start the informer when the CRD becomes available.
2759
func (c *Controller) StartNetAttachInformerFactory(ctx context.Context) {
60+
if c.tryStartNetAttachInformer(ctx) {
61+
return
62+
}
63+
64+
// CRD not found at startup, start background check loop
65+
klog.Info("Network attachment CRD not found at startup, will check periodically in background")
2866
ticker := time.NewTicker(10 * time.Second)
2967
go func() {
3068
defer ticker.Stop()
3169
for {
3270
select {
3371
case <-ticker.C:
34-
exists, err := c.isNetAttachCRDInstalled()
35-
if err != nil {
36-
klog.Errorf("checking network attachment CRD exists: %v", err)
37-
continue
38-
}
39-
40-
if exists {
41-
klog.Info("Start attachment informer")
42-
43-
c.netAttachInformerFactory.Start(ctx.Done())
44-
if !cache.WaitForCacheSync(ctx.Done(), c.netAttachSynced) {
45-
util.LogFatalAndExit(nil, "failed to wait for network attachment cache to sync")
46-
}
47-
72+
if c.tryStartNetAttachInformer(ctx) {
4873
return
4974
}
5075
case <-ctx.Done():

pkg/controller/vpc_nat_gateway.go

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,11 @@ func (c *Controller) handleInitVpcNatGw(key string) error {
323323
var interfaces []string
324324
if c.config.EnableNonPrimaryCNI {
325325
// extract external nad interface name
326-
externalNadNs, externalNadName := c.getExternalSubnetNad(gw)
326+
externalNadNs, externalNadName, nadErr := c.getExternalSubnetNad(gw)
327+
if nadErr != nil {
328+
klog.Errorf("failed to get external subnet NAD for gateway %s: %v", gw.Name, nadErr)
329+
return nadErr
330+
}
327331
networkStatusAnnotations := pod.Annotations[nadv1.NetworkStatusAnnot]
328332
externalNadFullName := fmt.Sprintf("%s/%s", externalNadNs, externalNadName)
329333
externalNadIfName, err := util.GetNadInterfaceFromNetworkStatusAnnotation(networkStatusAnnotations, externalNadFullName)
@@ -848,7 +852,11 @@ func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1
848852
annotations = maps.Clone(oldSts.Annotations)
849853
}
850854

851-
externalNadNamespace, externalNadName := c.getExternalSubnetNad(gw)
855+
externalNadNamespace, externalNadName, err := c.getExternalSubnetNad(gw)
856+
if err != nil {
857+
klog.Errorf("failed to get gw external subnet nad: %v", err)
858+
return nil, err
859+
}
852860

853861
eth0SubnetProvider, err := c.GetSubnetProvider(gw.Spec.Subnet)
854862
if err != nil {
@@ -1063,17 +1071,28 @@ func (c *Controller) genNatGwStatefulSet(gw *kubeovnv1.VpcNatGateway, oldSts *v1
10631071

10641072
// getExternalSubnetNad returns the namespace and name of the NetworkAttachmentDefinition associated with
10651073
// an external network attached to a NAT gateway
1066-
func (c *Controller) getExternalSubnetNad(gw *kubeovnv1.VpcNatGateway) (string, string) {
1074+
func (c *Controller) getExternalSubnetNad(gw *kubeovnv1.VpcNatGateway) (string, string, error) {
10671075
externalNadNamespace := c.config.PodNamespace
1068-
externalNadName := util.GetNatGwExternalNetwork(gw.Spec.ExternalSubnets)
1069-
if externalSubnet, err := c.subnetsLister.Get(externalNadName); err == nil {
1070-
if name, namespace, ok := util.GetNadBySubnetProvider(externalSubnet.Spec.Provider); ok {
1071-
externalNadName = name
1072-
externalNadNamespace = namespace
1073-
}
1076+
// GetNatGwExternalNetwork returns the subnet name from ExternalSubnets, or "ovn-vpc-external-network" if empty
1077+
externalSubnetName := util.GetNatGwExternalNetwork(gw.Spec.ExternalSubnets)
1078+
1079+
externalSubnet, err := c.subnetsLister.Get(externalSubnetName)
1080+
if err != nil {
1081+
err = fmt.Errorf("failed to get external subnet %s for NAT gateway %s: %w", externalSubnetName, gw.Name, err)
1082+
klog.Error(err)
1083+
return "", "", err
1084+
}
1085+
1086+
// Try to parse NAD info from subnet's provider
1087+
if name, namespace, ok := util.GetNadBySubnetProvider(externalSubnet.Spec.Provider); ok {
1088+
return namespace, name, nil
10741089
}
10751090

1076-
return externalNadNamespace, externalNadName
1091+
// Provider cannot be parsed to NAD info (e.g., provider is "ovn" or empty)
1092+
// Fall back to default NAD name which is the same as subnet name for external subnets
1093+
klog.Warningf("subnet %s provider %q cannot be parsed to NAD info, using default NAD %s/%s",
1094+
externalSubnetName, externalSubnet.Spec.Provider, externalNadNamespace, externalSubnetName)
1095+
return externalNadNamespace, externalSubnetName, nil
10771096
}
10781097

10791098
func (c *Controller) cleanUpVpcNatGw() error {

pkg/controller/vpc_nat_gateway_test.go

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,133 @@ func TestGetSubnetProvider(t *testing.T) {
123123
assert.Error(t, err, "Should error for missing subnet")
124124
})
125125
}
126+
127+
func TestGetExternalSubnetNad(t *testing.T) {
128+
tests := []struct {
129+
name string
130+
gw *kubeovnv1.VpcNatGateway
131+
subnets []*kubeovnv1.Subnet
132+
podNamespace string
133+
expectedNamespace string
134+
expectedName string
135+
expectError bool
136+
}{
137+
{
138+
name: "provider with 3 parts (name.namespace.ovn)",
139+
gw: &kubeovnv1.VpcNatGateway{
140+
ObjectMeta: metav1.ObjectMeta{Name: "test-gw"},
141+
Spec: kubeovnv1.VpcNatGatewaySpec{ExternalSubnets: []string{"external-subnet"}},
142+
},
143+
subnets: []*kubeovnv1.Subnet{
144+
{
145+
ObjectMeta: metav1.ObjectMeta{Name: "external-subnet"},
146+
Spec: kubeovnv1.SubnetSpec{Provider: "real-eip.kube-system.ovn"},
147+
},
148+
},
149+
podNamespace: "kube-system",
150+
expectedNamespace: "kube-system",
151+
expectedName: "real-eip",
152+
expectError: false,
153+
},
154+
{
155+
name: "provider with 2 parts (name.namespace)",
156+
gw: &kubeovnv1.VpcNatGateway{
157+
ObjectMeta: metav1.ObjectMeta{Name: "test-gw"},
158+
Spec: kubeovnv1.VpcNatGatewaySpec{ExternalSubnets: []string{"external-subnet"}},
159+
},
160+
subnets: []*kubeovnv1.Subnet{
161+
{
162+
ObjectMeta: metav1.ObjectMeta{Name: "external-subnet"},
163+
Spec: kubeovnv1.SubnetSpec{Provider: "my-nad.default"},
164+
},
165+
},
166+
podNamespace: "kube-system",
167+
expectedNamespace: "default",
168+
expectedName: "my-nad",
169+
expectError: false,
170+
},
171+
{
172+
name: "provider is ovn (fallback to subnet name)",
173+
gw: &kubeovnv1.VpcNatGateway{
174+
ObjectMeta: metav1.ObjectMeta{Name: "test-gw"},
175+
Spec: kubeovnv1.VpcNatGatewaySpec{ExternalSubnets: []string{"ovn-vpc-external-network"}},
176+
},
177+
subnets: []*kubeovnv1.Subnet{
178+
{
179+
ObjectMeta: metav1.ObjectMeta{Name: "ovn-vpc-external-network"},
180+
Spec: kubeovnv1.SubnetSpec{Provider: util.OvnProvider},
181+
},
182+
},
183+
podNamespace: "kube-system",
184+
expectedNamespace: "kube-system",
185+
expectedName: "ovn-vpc-external-network",
186+
expectError: false,
187+
},
188+
{
189+
name: "empty provider (fallback to subnet name)",
190+
gw: &kubeovnv1.VpcNatGateway{
191+
ObjectMeta: metav1.ObjectMeta{Name: "test-gw"},
192+
Spec: kubeovnv1.VpcNatGatewaySpec{ExternalSubnets: []string{"my-external-subnet"}},
193+
},
194+
subnets: []*kubeovnv1.Subnet{
195+
{
196+
ObjectMeta: metav1.ObjectMeta{Name: "my-external-subnet"},
197+
Spec: kubeovnv1.SubnetSpec{Provider: ""},
198+
},
199+
},
200+
podNamespace: "kube-system",
201+
expectedNamespace: "kube-system",
202+
expectedName: "my-external-subnet",
203+
expectError: false,
204+
},
205+
{
206+
name: "empty ExternalSubnets (use default)",
207+
gw: &kubeovnv1.VpcNatGateway{
208+
ObjectMeta: metav1.ObjectMeta{Name: "test-gw"},
209+
Spec: kubeovnv1.VpcNatGatewaySpec{ExternalSubnets: []string{}},
210+
},
211+
subnets: []*kubeovnv1.Subnet{
212+
{
213+
ObjectMeta: metav1.ObjectMeta{Name: "ovn-vpc-external-network"},
214+
Spec: kubeovnv1.SubnetSpec{Provider: "external.default.ovn"},
215+
},
216+
},
217+
podNamespace: "kube-system",
218+
expectedNamespace: "default",
219+
expectedName: "external",
220+
expectError: false,
221+
},
222+
{
223+
name: "subnet not found",
224+
gw: &kubeovnv1.VpcNatGateway{
225+
ObjectMeta: metav1.ObjectMeta{Name: "test-gw"},
226+
Spec: kubeovnv1.VpcNatGatewaySpec{ExternalSubnets: []string{"non-existent-subnet"}},
227+
},
228+
subnets: []*kubeovnv1.Subnet{},
229+
podNamespace: "kube-system",
230+
expectError: true,
231+
},
232+
}
233+
234+
for _, tt := range tests {
235+
t.Run(tt.name, func(t *testing.T) {
236+
fakeController, err := newFakeControllerWithOptions(t, &FakeControllerOptions{
237+
Subnets: tt.subnets,
238+
})
239+
require.NoError(t, err)
240+
controller := fakeController.fakeController
241+
controller.config.PodNamespace = tt.podNamespace
242+
243+
namespace, name, err := controller.getExternalSubnetNad(tt.gw)
244+
245+
if tt.expectError {
246+
assert.Error(t, err)
247+
return
248+
}
249+
250+
require.NoError(t, err)
251+
assert.Equal(t, tt.expectedNamespace, namespace, "namespace mismatch")
252+
assert.Equal(t, tt.expectedName, name, "name mismatch")
253+
})
254+
}
255+
}

0 commit comments

Comments
 (0)