Skip to content

Commit ec6279d

Browse files
authored
fix occasional migration failures caused by timing issues. (#6066)
* fix occasional migration failures caused by timing issues. Signed-off-by: clyi <clyi@alauda.io> * MigrationState is nil during the first migration Signed-off-by: clyi <clyi@alauda.io> --------- Signed-off-by: clyi <clyi@alauda.io>
1 parent 19153d0 commit ec6279d

File tree

2 files changed

+68
-60
lines changed

2 files changed

+68
-60
lines changed

pkg/controller/kubevirt.go

Lines changed: 68 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -121,63 +121,84 @@ func (c *Controller) handleAddOrUpdateVMIMigration(key string) error {
121121
return err
122122
}
123123

124-
if vmi.Status.MigrationState == nil {
125-
klog.Infof("VMI instance %s migration state is nil, skipping", key)
126-
return nil
124+
// use VirtualMachineInstance's MigrationState because VirtualMachineInstanceMigration's MigrationState is not updated until migration finished
125+
var srcNodeName, targetNodeName string
126+
if vmi.Status.MigrationState != nil {
127+
klog.Infof("current vmiMigration %s status %s, target Node %s, source Node %s, target Pod %s, source Pod %s", key,
128+
vmiMigration.Status.Phase,
129+
vmi.Status.MigrationState.TargetNode,
130+
vmi.Status.MigrationState.SourceNode,
131+
vmi.Status.MigrationState.TargetPod,
132+
vmi.Status.MigrationState.SourcePod)
133+
srcNodeName = vmi.Status.MigrationState.SourceNode
134+
targetNodeName = vmi.Status.MigrationState.TargetNode
135+
} else {
136+
klog.Infof("current vmiMigration %s status %s, vmi MigrationState is nil", key, vmiMigration.Status.Phase)
127137
}
128138

129-
if vmi.Status.MigrationState.SourcePod == "" {
130-
klog.Infof("VMI instance %s source pod is nil, skipping", key)
131-
return nil
132-
}
139+
portName := ovs.PodNameToPortName(vmiMigration.Spec.VMIName, vmiMigration.Namespace, util.OvnProvider)
140+
switch vmiMigration.Status.Phase {
141+
case kubevirtv1.MigrationScheduling:
142+
selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
143+
MatchLabels: map[string]string{
144+
"kubevirt.io/migrationJobUID": string(vmiMigration.UID),
145+
},
146+
})
147+
if err != nil {
148+
err = fmt.Errorf("failed to create label selector for migration job UID %s: %w", vmiMigration.UID, err)
149+
klog.Error(err)
150+
return err
151+
}
133152

134-
// use VirtualMachineInsance's MigrationState because VirtualMachineInsanceMigration's MigrationState is not updated util migration finished
135-
klog.Infof("current vmiMigration %s status %s, target Node %s, source Node %s, target Pod %s, source Pod %s", key,
136-
vmiMigration.Status.Phase,
137-
vmi.Status.MigrationState.TargetNode,
138-
vmi.Status.MigrationState.SourceNode,
139-
vmi.Status.MigrationState.TargetPod,
140-
vmi.Status.MigrationState.SourcePod)
153+
pods, err := c.podsLister.Pods(vmiMigration.Namespace).List(selector)
154+
if err != nil {
155+
err = fmt.Errorf("failed to list pods with migration job UID %s: %w", vmiMigration.UID, err)
156+
klog.Error(err)
157+
return err
158+
}
141159

142-
sourcePodName := vmi.Status.MigrationState.SourcePod
143-
sourcePod, err := c.config.KubeClient.CoreV1().Pods(namespace).Get(context.TODO(), sourcePodName, metav1.GetOptions{})
144-
if err != nil {
145-
err = fmt.Errorf("failed to get source pod %s, %w", sourcePodName, err)
146-
klog.Error(err)
147-
return err
148-
}
160+
if len(pods) > 0 {
161+
targetPod := pods[0]
162+
// During MigrationScheduling phase, use vmi.Status.NodeName if SourceNode is empty
163+
// because vmi.Status.MigrationState may not be fully synchronized yet
164+
sourceNode := srcNodeName
165+
if sourceNode == "" {
166+
sourceNode = vmi.Status.NodeName
167+
}
149168

150-
podNets, err := c.getPodKubeovnNets(sourcePod)
151-
if err != nil {
152-
err = fmt.Errorf("failed to get pod nets %w", err)
153-
klog.Error(err)
154-
return err
155-
}
169+
if sourceNode == "" || targetPod.Spec.NodeName == "" || sourceNode == targetPod.Spec.NodeName {
170+
klog.Warningf("VM pod %s/%s migration setup skipped, source node: %s, target node: %s (migration job UID: %s)",
171+
targetPod.Namespace, targetPod.Name, sourceNode, targetPod.Spec.NodeName, vmiMigration.UID)
172+
return nil
173+
}
156174

157-
for _, podNet := range podNets {
158-
// Skip non-OVN subnets that don't create OVN logical switch ports
159-
if !isOvnSubnet(podNet.Subnet) {
160-
continue
161-
}
175+
klog.Infof("VM pod %s/%s is migrating from %s to %s (migration job UID: %s)",
176+
targetPod.Namespace, targetPod.Name, sourceNode, targetPod.Spec.NodeName, vmiMigration.UID)
162177

163-
portName := ovs.PodNameToPortName(vmiMigration.Spec.VMIName, vmiMigration.Namespace, podNet.ProviderName)
164-
srcNodeName := vmi.Status.MigrationState.SourceNode
165-
targetNodeName := vmi.Status.MigrationState.TargetNode
166-
switch vmiMigration.Status.Phase {
167-
case kubevirtv1.MigrationSucceeded:
168-
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated succeed", portName, srcNodeName, targetNodeName)
169-
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, false); err != nil {
170-
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
171-
klog.Error(err)
172-
return err
173-
}
174-
case kubevirtv1.MigrationFailed:
175-
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated fail", portName, srcNodeName, targetNodeName)
176-
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, true); err != nil {
177-
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
178+
if err := c.OVNNbClient.SetLogicalSwitchPortMigrateOptions(portName, sourceNode, targetPod.Spec.NodeName); err != nil {
179+
err = fmt.Errorf("failed to set migrate options for VM pod lsp %s: %w", portName, err)
178180
klog.Error(err)
179181
return err
180182
}
183+
klog.Infof("successfully set migrate options for lsp %s from %s to %s", portName, sourceNode, targetPod.Spec.NodeName)
184+
} else {
185+
klog.Warningf("target pod not yet created for migration job UID %s in phase %s, waiting for pod creation",
186+
vmiMigration.UID, vmiMigration.Status.Phase)
187+
return nil
188+
}
189+
case kubevirtv1.MigrationSucceeded:
190+
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated succeed", portName, srcNodeName, targetNodeName)
191+
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, false); err != nil {
192+
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
193+
klog.Error(err)
194+
return err
195+
}
196+
case kubevirtv1.MigrationFailed:
197+
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated fail", portName, srcNodeName, targetNodeName)
198+
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, true); err != nil {
199+
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
200+
klog.Error(err)
201+
return err
181202
}
182203
}
183204
return nil

pkg/controller/pod.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -648,19 +648,6 @@ func (c *Controller) reconcileAllocateSubnets(pod *v1.Pod, needAllocatePodNets [
648648
}
649649
}
650650

651-
if isVMPod {
652-
if _, ok := pod.Labels["kubevirt.io/migrationJobUID"]; ok {
653-
if sourceNode, ok := pod.Labels["kubevirt.io/nodeName"]; ok && sourceNode != pod.Spec.NodeName {
654-
klog.Infof("VM pod %s/%s is migrating from %s to %s",
655-
pod.Namespace, pod.Name, sourceNode, pod.Spec.NodeName)
656-
if err := c.OVNNbClient.SetLogicalSwitchPortMigrateOptions(portName, sourceNode, pod.Spec.NodeName); err != nil {
657-
klog.Errorf("failed to set migrate options for VM pod lsp %s: %v", portName, err)
658-
return nil, err
659-
}
660-
}
661-
}
662-
}
663-
664651
if securityGroupAnnotation != "" || oldSgList != nil {
665652
securityGroups := strings.ReplaceAll(securityGroupAnnotation, " ", "")
666653
newSgList := strings.Split(securityGroups, ",")

0 commit comments

Comments
 (0)