Skip to content

Commit 6e21af2

Browse files
mengyu0829, mengyu, oilbeater
authored
ip双栈的情况下gc interface失败 + sts的pod在经过多次重启后偶现chassis和pod所在节点不匹配的gc (#6033)
* 问题描述
  1. ip双栈的情况下gc interface失败;
  2. sts下pod在经过重启以及扩缩容后偶现chassis和pod所在节点不匹配;
  产生原因
  1. gc Interface通过查询ovs interface表的name,external_ids,error值,然后通过逗号分隔,ip双栈的情况下external_ids中会出现ip=ipv4,ipv6的值,多了个逗号,再用逗号分隔,解析时按顺序获取值就错位。
  2. kube-ovn中缺少对pod的chassis和pod所在节点不匹配时interface的gc处理;
  解决方法
  - 调整拼接顺序为:“name,error,external_ids”,再用strings.SplitN(x, ",", 3)只分割3部分,ip双栈被整个划分到第三部分,就不会影响解析;
  - 在原gc逻辑中增加:当查询到pod时,再比对pod所在节点和当前cni所在节点是否一致,不一致则删除interface;
  - 优化查询:以前gc时会查所有的interface,现在只查不是up状态的(状态为down或空),减少处理的数据量。

* 问题描述
  1. ip双栈的情况下gc interface失败;
  2. sts下pod在经过重启以及扩缩容后偶现chassis和pod所在节点不匹配;
  产生原因
  1. gc Interface通过查询ovs interface表的name,external_ids,error值,然后通过逗号分隔,ip双栈的情况下external_ids中会出现ip=ipv4,ipv6的值,多了个逗号,再用逗号分隔,解析时按顺序获取值就错位。
  2. kube-ovn中缺少对pod的chassis和pod所在节点不匹配时interface的gc处理;
  解决方法
  - 调整拼接顺序为:“name,error,external_ids”,再用strings.SplitN(x, ",", 3)只分割3部分,ip双栈被整个划分到第三部分,就不会影响解析;
  - 在原gc逻辑中增加:当查询到pod时,再比对pod所在节点和当前cni所在节点是否一致,不一致则删除interface;
  - 优化查询:以前gc时会查所有的interface,现在只查不是up状态的(状态为down或空),减少处理的数据量。

* 去掉没用的引用包

  Signed-off-by: mengyu <mengyu@unicloud.com>

* 问题描述
  1. ip双栈的情况下gc interface失败;
  2. sts下pod在经过重启以及扩缩容后偶现chassis和pod所在节点不匹配;
  产生原因
  1. gc Interface通过查询ovs interface表的name,external_ids,error值,然后通过逗号分隔,ip双栈的情况下external_ids中会出现ip=ipv4,ipv6的值,多了个逗号,再用逗号分隔,解析时按顺序获取值就错位。
  2. kube-ovn中缺少对pod的chassis和pod所在节点不匹配时interface的gc处理;
  解决方法
  - 调整拼接顺序为:“name,error,external_ids”,再用strings.SplitN(x, ",", 3)只分割3部分,ip双栈被整个划分到第三部分,就不会影响解析;
  - 在原gc逻辑中增加:当查询到pod时,再比对pod所在节点和当前cni所在节点是否一致,不一致则删除interface;
  - 优化查询:以前gc时会查所有的interface,现在只查不是up状态的(状态为down或空),减少处理的数据量。

  Signed-off-by: mengyu <mengyu@unicloud.com>

* 问题描述
  1. ip双栈的情况下gc interface失败;
  2. sts下pod在经过重启以及扩缩容后偶现chassis和pod所在节点不匹配;
  产生原因
  1. gc Interface通过查询ovs interface表的name,external_ids,error值,然后通过逗号分隔,ip双栈的情况下external_ids中会出现ip=ipv4,ipv6的值,多了个逗号,再用逗号分隔,解析时按顺序获取值就错位。
  2. kube-ovn中缺少对pod的chassis和pod所在节点不匹配时interface的gc处理;
  解决方法
  - 调整拼接顺序为:“name,error,external_ids”,再用strings.SplitN(x, ",", 3)只分割3部分,ip双栈被整个划分到第三部分,就不会影响解析;
  - 在原gc逻辑中增加:当查询到pod时,再比对pod所在节点和当前cni所在节点是否一致,不一致则删除interface;
  - 优化查询:以前gc时会查所有的interface,现在只查不是up状态的(状态为down或空),减少处理的数据量。

  Signed-off-by: mengyu <mengyu@unicloud.com>

* 去掉没用的引用包

  Signed-off-by: mengyu <mengyu@unicloud.com>
  Signed-off-by: mengyu <mengyu@unicloud.com>

* 增加空格

  Signed-off-by: mengyu <mengyu@unicloud.com>

* fix lint issue

  Signed-off-by: Mengxin Liu <liumengxinfly@gmail.com>

---------

Signed-off-by: mengyu <mengyu@unicloud.com>
Signed-off-by: Mengxin Liu <liumengxinfly@gmail.com>
Co-authored-by: mengyu <mengyu@unicloud.com>
Co-authored-by: Mengxin Liu <liumengxinfly@gmail.com>
1 parent 7a3232c commit 6e21af2

File tree

2 files changed

+33
-23
lines changed

2 files changed

+33
-23
lines changed

pkg/daemon/controller.go

Lines changed: 28 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -760,29 +760,39 @@ func (c *Controller) gcInterfaces() {
760760
continue
761761
}
762762

763-
if _, err := c.podsLister.Pods(podNamespace).Get(podName); err != nil && k8serrors.IsNotFound(err) {
763+
if podEntity, err := c.podsLister.Pods(podNamespace).Get(podName); err != nil {
764764
// Pod not found by name. Check if this might be a KubeVirt VM.
765765
// For KubeVirt VMs, the pod_name in OVS external_ids is set to the VM name (not the launcher pod name).
766766
// The actual launcher pod has the label 'vm.kubevirt.io/name' with the VM name as value.
767767
// Try to find launcher pods by this label.
768-
selector := labels.SelectorFromSet(map[string]string{util.KubeVirtVMNameLabel: podName})
769-
launcherPods, listErr := c.podsLister.Pods(podNamespace).List(selector)
770-
if listErr != nil {
771-
klog.Errorf("failed to list launcher pods for vm %s/%s: %v", podNamespace, podName, listErr)
772-
continue
773-
}
774-
775-
// If we found launcher pod(s) for this VM, keep the interface
776-
if len(launcherPods) > 0 {
777-
klog.V(5).Infof("found %d launcher pod(s) for vm %s/%s, keeping ovs interface %s",
778-
len(launcherPods), podNamespace, podName, iface)
779-
continue
768+
if k8serrors.IsNotFound(err) {
769+
selector := labels.SelectorFromSet(map[string]string{util.KubeVirtVMNameLabel: podName})
770+
launcherPods, listErr := c.podsLister.Pods(podNamespace).List(selector)
771+
if listErr != nil {
772+
klog.Errorf("failed to list launcher pods for vm %s/%s: %v", podNamespace, podName, listErr)
773+
continue
774+
}
775+
776+
// If we found launcher pod(s) for this VM, keep the interface
777+
if len(launcherPods) > 0 {
778+
klog.V(5).Infof("found %d launcher pod(s) for vm %s/%s, keeping ovs interface %s",
779+
len(launcherPods), podNamespace, podName, iface)
780+
continue
781+
}
782+
783+
// No pod and no launcher pod found - safe to delete
784+
klog.Infof("pod %s/%s not found, delete ovs interface %s", podNamespace, podName, iface)
785+
if err := ovs.CleanInterface(iface); err != nil {
786+
klog.Errorf("failed to clean ovs interface %s: %v", iface, err)
787+
}
780788
}
781-
782-
// No pod and no launcher pod found - safe to delete
783-
klog.Infof("pod %s/%s not found, delete ovs interface %s", podNamespace, podName, iface)
784-
if err := ovs.CleanInterface(iface); err != nil {
785-
klog.Errorf("failed to clean ovs interface %s: %v", iface, err)
789+
} else {
790+
// If the pod is found, compare the pod's node with the current cni node. If they differ, delete the interface.
791+
if podEntity.Spec.NodeName != c.config.NodeName {
792+
klog.Infof("pod %s/%s is on node %s, delete ovs interface %s on node %s ", podNamespace, podName, podEntity.Spec.NodeName, iface, c.config.NodeName)
793+
if err := ovs.CleanInterface(iface); err != nil {
794+
klog.Errorf("failed to clean ovs interface %s: %v", iface, err)
795+
}
786796
}
787797
}
788798
}

pkg/ovs/ovs-vsctl.go

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -240,8 +240,8 @@ func ClearPodBandwidth(podName, podNamespace, ifaceID string) error {
240240
var lastInterfacePodMap map[string]string
241241

242242
func ListInterfacePodMap() (map[string]string, error) {
243-
output, err := Exec("--data=bare", "--format=csv", "--no-heading", "--columns=name,external_ids,error", "find",
244-
"interface", "external_ids:pod_name!=[]", "external_ids:pod_namespace!=[]")
243+
output, err := Exec("--data=bare", "--format=csv", "--no-heading", "--columns=name,error,external_ids", "find",
244+
"interface", "external_ids:pod_name!=[]", "external_ids:pod_namespace!=[]", "link_state!=up")
245245
if err != nil {
246246
klog.Errorf("failed to list interface, %v", err)
247247
return nil, err
@@ -252,14 +252,14 @@ func ListInterfacePodMap() (map[string]string, error) {
252252
if len(strings.TrimSpace(l)) == 0 {
253253
continue
254254
}
255-
parts := strings.Split(strings.TrimSpace(l), ",")
255+
parts := strings.SplitN(strings.TrimSpace(l), ",", 3)
256256
if len(parts) != 3 {
257257
continue
258258
}
259259
ifaceName := strings.TrimSpace(parts[0])
260-
errText := strings.TrimSpace(parts[2])
260+
errText := strings.TrimSpace(parts[1])
261261
var podNamespace, podName string
262-
for externalID := range strings.FieldsSeq(parts[1]) {
262+
for externalID := range strings.FieldsSeq(parts[2]) {
263263
if strings.Contains(externalID, "pod_name=") {
264264
podName = strings.TrimPrefix(strings.TrimSpace(externalID), "pod_name=")
265265
}

0 commit comments

Comments
 (0)