Skip to content

Commit a3b0070

Browse files
authored
fix(controller): backfill nat gateway lan ip safely (kubeovn#6424)
* fix(controller): backfill nat gateway lan ip and sync status Backfill spec.lanIp from the running NAT gateway pod using StatefulSet owner references. Also mirror spec.lanIp into status and add tests for both behaviors. Signed-off-by: jimyag <git@jimyag.com> * refactor(controller): remove nat gateway status lanIp sync Keep lanIp as a spec-only field for VpcNatGateway. Remove redundant status.lanIp updates and related unit test coverage. Signed-off-by: jimyag <git@jimyag.com> * fix(controller): guard nat gw lanIp backfill when lister is nil Fallback to direct API get when vpcNatGatewayLister is not initialized. This prevents nil pointer panics in lanIp backfill test scenarios. Signed-off-by: jimyag <git@jimyag.com> * fix(controller): harden nat gw lanIp backfill checks Restrict lanIp backfill to controller namespace pods and validate parsed IP values before patching. Also simplify owner name guard logic and expand unit coverage for namespace and invalid-ip cases. Signed-off-by: jimyag <git@jimyag.com> * fix(controller): select nat gw lan ip by subnet protocol Choose backfilled lanIp using subnet protocol to avoid preferring IPv4 on IPv6 subnets. Add unit coverage for IPv6 subnet backfill behavior. Signed-off-by: jimyag <git@jimyag.com> * style(test): fix gofumpt formatting in pod backfill test Apply gofumpt formatting to TestBackfillVpcNatGwLanIPFromPod setup block. This matches CI lint expectations for Build kube-ovn. Signed-off-by: jimyag <git@jimyag.com> * fix(controller): trigger nat gateway reconcile from pod updates only Limit VpcNatGateway reconcile trigger to NAT gateway pod update paths. Remove parent enqueue logic from EIP/FIP/DNAT/SNAT child resource enqueue handlers. Keep lanIP backfill in parent reconcile. Signed-off-by: jimyag <git@jimyag.com> * chore(controller): revert non-functional nat gw handler refactors Signed-off-by: jimyag <git@jimyag.com> --------- Signed-off-by: jimyag <git@jimyag.com>
1 parent 5b43fb2 commit a3b0070

File tree

4 files changed

+284
-1
lines changed

4 files changed

+284
-1
lines changed

pkg/controller/controller_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ func alwaysReady() bool { return true }
6262
// FakeControllerOptions holds optional parameters for creating a fake controller
6363
type FakeControllerOptions struct {
6464
Subnets []*kubeovnv1.Subnet
65+
VpcNatGateways []*kubeovnv1.VpcNatGateway
6566
IPs []*kubeovnv1.IP
6667
Vlans []*kubeovnv1.Vlan
6768
NetworkAttachments []*nadv1.NetworkAttachmentDefinition
@@ -117,6 +118,13 @@ func newFakeControllerWithOptions(t *testing.T, opts *FakeControllerOptions) (*f
117118
return nil, err
118119
}
119120
}
121+
for _, gw := range opts.VpcNatGateways {
122+
_, err := kubeovnClient.KubeovnV1().VpcNatGateways().Create(
123+
context.Background(), gw, metav1.CreateOptions{})
124+
if err != nil {
125+
return nil, err
126+
}
127+
}
120128
for _, ip := range opts.IPs {
121129
_, err := kubeovnClient.KubeovnV1().IPs().Create(
122130
context.Background(), ip, metav1.CreateOptions{})

pkg/controller/pod.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,7 @@ func (c *Controller) handleAddOrUpdatePod(key string) (err error) {
543543

544544
isVpcNatGw, vpcGwName := c.checkIsPodVpcNatGw(pod)
545545
if isVpcNatGw {
546+
c.enqueueAddOrUpdateVpcNatGwByName(vpcGwName, "natgw-pod-update")
546547
if needRestartNatGatewayPod(pod) {
547548
klog.Infof("restarting vpc nat gateway %s", vpcGwName)
548549
c.addOrUpdateVpcNatGatewayQueue.Add(vpcGwName)
@@ -722,6 +723,7 @@ func (c *Controller) reconcileAllocateSubnets(pod *v1.Pod, needAllocatePodNets [
722723
// Check if pod is a vpc nat gateway. Annotation set will have subnet provider name as prefix
723724
isVpcNatGw, vpcGwName := c.checkIsPodVpcNatGw(pod)
724725
if isVpcNatGw {
726+
c.enqueueAddOrUpdateVpcNatGwByName(vpcGwName, "natgw-pod-update")
725727
klog.Infof("init vpc nat gateway pod %s/%s with name %s", namespace, name, vpcGwName)
726728
c.initVpcNatGatewayQueue.Add(vpcGwName)
727729
}
@@ -2637,6 +2639,108 @@ func (c *Controller) checkIsPodVpcNatGw(pod *v1.Pod) (bool, string) {
26372639
return isVpcNatGw, vpcGwName
26382640
}
26392641

2642+
func natGwNameFromStatefulSetOwner(pod *v1.Pod) string {
2643+
isStsPod, stsName, _ := isStatefulSetPod(pod)
2644+
if !isStsPod {
2645+
return ""
2646+
}
2647+
2648+
prefix := util.VpcNatGwNamePrefix + "-"
2649+
if !strings.HasPrefix(stsName, prefix) {
2650+
return ""
2651+
}
2652+
return strings.TrimPrefix(stsName, prefix)
2653+
}
2654+
2655+
func (c *Controller) backfillVpcNatGwLanIPFromPod(pod *v1.Pod, gwName string) error {
2656+
if pod == nil {
2657+
return nil
2658+
}
2659+
if pod.Namespace != c.config.PodNamespace {
2660+
return nil
2661+
}
2662+
2663+
ownerGwName := natGwNameFromStatefulSetOwner(pod)
2664+
if ownerGwName == "" {
2665+
return nil
2666+
}
2667+
if gwName == "" {
2668+
gwName = ownerGwName
2669+
}
2670+
// Use owner reference as a guard to avoid patching unrelated pods carrying a stale annotation.
2671+
if ownerGwName != gwName {
2672+
klog.Warningf("skip backfill for pod %s/%s: gw annotation %q does not match owner statefulset %q",
2673+
pod.Namespace, pod.Name, gwName, ownerGwName)
2674+
return nil
2675+
}
2676+
2677+
var (
2678+
gw *kubeovnv1.VpcNatGateway
2679+
err error
2680+
)
2681+
if c.vpcNatGatewayLister != nil {
2682+
gw, err = c.vpcNatGatewayLister.Get(gwName)
2683+
} else {
2684+
gw, err = c.config.KubeOvnClient.KubeovnV1().VpcNatGateways().Get(context.Background(), gwName, metav1.GetOptions{})
2685+
}
2686+
if err != nil {
2687+
if k8serrors.IsNotFound(err) {
2688+
return nil
2689+
}
2690+
return err
2691+
}
2692+
if gw.Spec.LanIP != "" {
2693+
return nil
2694+
}
2695+
2696+
subnet, err := c.subnetsLister.Get(gw.Spec.Subnet)
2697+
if err != nil {
2698+
return fmt.Errorf("failed to get subnet %s: %w", gw.Spec.Subnet, err)
2699+
}
2700+
if !isOvnSubnet(subnet) {
2701+
return fmt.Errorf("subnet %s is not an OVN subnet", gw.Spec.Subnet)
2702+
}
2703+
provider := subnet.Spec.Provider
2704+
2705+
lanIP := pod.Annotations[fmt.Sprintf(util.IPAddressAnnotationTemplate, provider)]
2706+
v4IP, v6IP := util.SplitStringIP(lanIP)
2707+
switch subnet.Spec.Protocol {
2708+
case kubeovnv1.ProtocolIPv6:
2709+
lanIP = v6IP
2710+
case kubeovnv1.ProtocolIPv4:
2711+
lanIP = v4IP
2712+
case kubeovnv1.ProtocolDual:
2713+
if v4IP != "" {
2714+
lanIP = v4IP
2715+
} else {
2716+
lanIP = v6IP
2717+
}
2718+
default:
2719+
lanIP = v4IP
2720+
}
2721+
if lanIP == "" || net.ParseIP(lanIP) == nil {
2722+
return nil
2723+
}
2724+
2725+
patchPayload := map[string]any{
2726+
"spec": map[string]string{
2727+
"lanIp": lanIP,
2728+
},
2729+
}
2730+
raw, err := json.Marshal(patchPayload)
2731+
if err != nil {
2732+
return err
2733+
}
2734+
2735+
_, err = c.config.KubeOvnClient.KubeovnV1().VpcNatGateways().Patch(context.Background(),
2736+
gw.Name, types.MergePatchType, raw, metav1.PatchOptions{})
2737+
if err != nil {
2738+
return err
2739+
}
2740+
klog.Infof("backfilled vpc nat gateway %s spec.lanIP with pod %s/%s ip %s", gw.Name, pod.Namespace, pod.Name, lanIP)
2741+
return nil
2742+
}
2743+
26402744
func perInterfaceIPAnnotationKey(nadName, nadNamespace, ifaceName string) string {
26412745
return fmt.Sprintf("%s.%s.kubernetes.io/ip_address.%s", nadName, nadNamespace, ifaceName)
26422746
}

pkg/controller/pod_test.go

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
package controller
22

33
import (
4+
"context"
45
"fmt"
56
"testing"
67

78
nadv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
89
"github.com/stretchr/testify/assert"
910
"github.com/stretchr/testify/require"
11+
appsv1 "k8s.io/api/apps/v1"
1012
corev1 "k8s.io/api/core/v1"
1113
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1214

@@ -173,6 +175,160 @@ func TestCheckIsPodVpcNatGw(t *testing.T) {
173175
})
174176
}
175177

178+
func TestBackfillVpcNatGwLanIPFromPod(t *testing.T) {
179+
const (
180+
gwName = "test-nat-gw"
181+
subnet = "nat-subnet"
182+
provider = "net1.default.ovn"
183+
lanIP = "10.244.0.10"
184+
namespace = "default"
185+
)
186+
187+
tests := []struct {
188+
name string
189+
gwSpecLanIP string
190+
subnetProtocol string
191+
givenGwName string
192+
podOwnerName string
193+
podNamespace string
194+
controllerPodNamespace string
195+
podAnnotation map[string]string
196+
expectedLanIP string
197+
}{
198+
{
199+
name: "backfill lanIP from pod annotation",
200+
gwSpecLanIP: "",
201+
subnetProtocol: kubeovnv1.ProtocolIPv4,
202+
givenGwName: gwName,
203+
podOwnerName: util.GenNatGwName(gwName),
204+
podNamespace: namespace,
205+
controllerPodNamespace: namespace,
206+
podAnnotation: map[string]string{
207+
fmt.Sprintf(util.IPAddressAnnotationTemplate, provider): lanIP,
208+
},
209+
expectedLanIP: lanIP,
210+
},
211+
{
212+
name: "derive gateway name from owner reference",
213+
gwSpecLanIP: "",
214+
subnetProtocol: kubeovnv1.ProtocolIPv4,
215+
givenGwName: "",
216+
podOwnerName: util.GenNatGwName(gwName),
217+
podNamespace: namespace,
218+
controllerPodNamespace: namespace,
219+
podAnnotation: map[string]string{
220+
fmt.Sprintf(util.IPAddressAnnotationTemplate, provider): lanIP,
221+
},
222+
expectedLanIP: lanIP,
223+
},
224+
{
225+
name: "skip when spec lanIP already set",
226+
gwSpecLanIP: "10.244.0.99",
227+
subnetProtocol: kubeovnv1.ProtocolIPv4,
228+
givenGwName: gwName,
229+
podOwnerName: util.GenNatGwName(gwName),
230+
podNamespace: namespace,
231+
controllerPodNamespace: namespace,
232+
podAnnotation: map[string]string{
233+
fmt.Sprintf(util.IPAddressAnnotationTemplate, provider): lanIP,
234+
},
235+
expectedLanIP: "10.244.0.99",
236+
},
237+
{
238+
name: "skip when pod namespace is different from controller namespace",
239+
gwSpecLanIP: "",
240+
subnetProtocol: kubeovnv1.ProtocolIPv4,
241+
givenGwName: gwName,
242+
podOwnerName: util.GenNatGwName(gwName),
243+
podNamespace: "other-ns",
244+
controllerPodNamespace: namespace,
245+
podAnnotation: map[string]string{
246+
fmt.Sprintf(util.IPAddressAnnotationTemplate, provider): lanIP,
247+
},
248+
expectedLanIP: "",
249+
},
250+
{
251+
name: "skip when lanIP annotation is invalid",
252+
gwSpecLanIP: "",
253+
subnetProtocol: kubeovnv1.ProtocolIPv4,
254+
givenGwName: gwName,
255+
podOwnerName: util.GenNatGwName(gwName),
256+
podNamespace: namespace,
257+
controllerPodNamespace: namespace,
258+
podAnnotation: map[string]string{
259+
fmt.Sprintf(util.IPAddressAnnotationTemplate, provider): "not-an-ip",
260+
},
261+
expectedLanIP: "",
262+
},
263+
{
264+
name: "prefer IPv6 address for IPv6 subnet",
265+
gwSpecLanIP: "",
266+
subnetProtocol: kubeovnv1.ProtocolIPv6,
267+
givenGwName: gwName,
268+
podOwnerName: util.GenNatGwName(gwName),
269+
podNamespace: namespace,
270+
controllerPodNamespace: namespace,
271+
podAnnotation: map[string]string{
272+
fmt.Sprintf(util.IPAddressAnnotationTemplate, provider): "10.244.0.10,fd00:10:16::10",
273+
},
274+
expectedLanIP: "fd00:10:16::10",
275+
},
276+
}
277+
278+
for _, tt := range tests {
279+
t.Run(tt.name, func(t *testing.T) {
280+
gw := &kubeovnv1.VpcNatGateway{
281+
ObjectMeta: metav1.ObjectMeta{
282+
Name: gwName,
283+
},
284+
Spec: kubeovnv1.VpcNatGatewaySpec{
285+
Vpc: "vpc-a",
286+
Subnet: subnet,
287+
LanIP: tt.gwSpecLanIP,
288+
},
289+
}
290+
pod := &corev1.Pod{
291+
ObjectMeta: metav1.ObjectMeta{
292+
Name: util.GenNatGwPodName(gwName),
293+
Namespace: tt.podNamespace,
294+
Annotations: tt.podAnnotation,
295+
OwnerReferences: []metav1.OwnerReference{
296+
{
297+
APIVersion: appsv1.SchemeGroupVersion.String(),
298+
Kind: util.KindStatefulSet,
299+
Name: tt.podOwnerName,
300+
},
301+
},
302+
},
303+
}
304+
305+
fakeController, err := newFakeControllerWithOptions(t, &FakeControllerOptions{
306+
Subnets: []*kubeovnv1.Subnet{
307+
{
308+
ObjectMeta: metav1.ObjectMeta{Name: subnet},
309+
Spec: kubeovnv1.SubnetSpec{
310+
Provider: provider,
311+
Protocol: tt.subnetProtocol,
312+
},
313+
},
314+
},
315+
VpcNatGateways: []*kubeovnv1.VpcNatGateway{gw},
316+
})
317+
require.NoError(t, err)
318+
319+
controller := fakeController.fakeController
320+
controller.config.PodNamespace = tt.controllerPodNamespace
321+
err = controller.backfillVpcNatGwLanIPFromPod(pod, tt.givenGwName)
322+
require.NoError(t, err)
323+
324+
gotGw, err := controller.config.KubeOvnClient.KubeovnV1().VpcNatGateways().Get(
325+
context.Background(), gwName, metav1.GetOptions{})
326+
require.NoError(t, err)
327+
assert.Equal(t, tt.expectedLanIP, gotGw.Spec.LanIP)
328+
})
329+
}
330+
}
331+
176332
func TestGetPodKubeovnNetsNonPrimaryCNI(t *testing.T) {
177333
tests := []struct {
178334
name string

pkg/controller/vpc_nat_gateway.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,14 @@ func (c *Controller) enqueueAddVpcNatGw(obj any) {
9999
c.addOrUpdateVpcNatGatewayQueue.Add(key)
100100
}
101101

102+
func (c *Controller) enqueueAddOrUpdateVpcNatGwByName(gwName, reason string) {
103+
if gwName == "" || c.addOrUpdateVpcNatGatewayQueue == nil {
104+
return
105+
}
106+
klog.V(3).Infof("enqueue vpc-nat-gw %s from %s", gwName, reason)
107+
c.addOrUpdateVpcNatGatewayQueue.Add(gwName)
108+
}
109+
102110
func (c *Controller) enqueueUpdateVpcNatGw(_, newObj any) {
103111
key := cache.MetaObjectToName(newObj.(*kubeovnv1.VpcNatGateway)).String()
104112
klog.V(3).Infof("enqueue update vpc-nat-gw %s", key)
@@ -203,6 +211,10 @@ func (c *Controller) handleAddOrUpdateVpcNatGw(key string) error {
203211
var natGwPodContainerRestartCount int32
204212
pod, err := c.getNatGwPod(key)
205213
if err == nil {
214+
if err = c.backfillVpcNatGwLanIPFromPod(pod, key); err != nil {
215+
klog.Errorf("failed to backfill lanIP for vpc nat gateway %s: %v", key, err)
216+
return err
217+
}
206218
for _, containerStatus := range pod.Status.ContainerStatuses {
207219
if containerStatus.Name == "vpc-nat-gw" {
208220
natGwPodContainerRestartCount = containerStatus.RestartCount
@@ -224,6 +236,7 @@ func (c *Controller) handleAddOrUpdateVpcNatGw(key string) error {
224236
needToCreate, oldSts = true, nil
225237
}
226238
gwChanged := isVpcNatGwChanged(gw)
239+
needPatchStatus := gwChanged
227240

228241
newSts, err := c.genNatGwStatefulSet(gw, oldSts, natGwPodContainerRestartCount)
229242
if err != nil {
@@ -256,6 +269,9 @@ func (c *Controller) handleAddOrUpdateVpcNatGw(key string) error {
256269
klog.Error(err)
257270
return err
258271
}
272+
}
273+
274+
if needPatchStatus {
259275
if err = c.patchNatGwStatus(key); err != nil {
260276
klog.Errorf("failed to patch nat gw sts status for nat gw %s, %v", key, err)
261277
return err
@@ -1304,7 +1320,6 @@ func (c *Controller) patchNatGwStatus(key string) error {
13041320
gw.Status.Affinity = gw.Spec.Affinity
13051321
changed = true
13061322
}
1307-
13081323
if changed {
13091324
bytes, err := gw.Status.Bytes()
13101325
if err != nil {

0 commit comments

Comments
 (0)