Skip to content

Commit fc8a536

Browse files
committed
fix: ensure kubevirt migration uses current migration state
Add MigrationUID validation to prevent stale state from being used during consecutive VM migrations (e.g., A→B→A).

Changes:
1. Add a MigrationUID check: only use vmi.Status.MigrationState if its MigrationUID matches the current vmiMigration.UID.
2. Simplify MigrationScheduling handling: wait for a valid state instead of falling back to the Pod or vmi.Status.NodeName.
3. Add unit tests covering UID validation and migration scenarios.

The root cause was that vmi.Status.MigrationState could contain stale information from a previous migration, which led to incorrect node detection and to SetLogicalSwitchPortMigrateOptions being skipped.

Fixes: #6220
Signed-off-by: zbb88888 <jmdxjsjgcxy@gmail.com>
1 parent 8f63d67 commit fc8a536

File tree

3 files changed

+304
-71
lines changed

3 files changed

+304
-71
lines changed

pkg/controller/kubevirt.go

Lines changed: 101 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,21 @@ import (
1818
)
1919

2020
func (c *Controller) enqueueAddVMIMigration(obj any) {
21-
key := cache.MetaObjectToName(obj.(*kubevirtv1.VirtualMachineInstanceMigration)).String()
21+
vmiMigration := obj.(*kubevirtv1.VirtualMachineInstanceMigration)
22+
key := cache.MetaObjectToName(vmiMigration).String()
2223
klog.Infof("enqueue add VMI migration %s", key)
2324
c.addOrUpdateVMIMigrationQueue.Add(key)
2425
}
2526

2627
func (c *Controller) enqueueUpdateVMIMigration(oldObj, newObj any) {
27-
oldVmi := oldObj.(*kubevirtv1.VirtualMachineInstanceMigration)
28-
newVmi := newObj.(*kubevirtv1.VirtualMachineInstanceMigration)
28+
oldVmiMigration := oldObj.(*kubevirtv1.VirtualMachineInstanceMigration)
29+
newVmiMigration := newObj.(*kubevirtv1.VirtualMachineInstanceMigration)
2930

30-
if !newVmi.DeletionTimestamp.IsZero() ||
31-
oldVmi.Status.Phase != newVmi.Status.Phase {
32-
key := cache.MetaObjectToName(newVmi).String()
33-
klog.Infof("enqueue update VMI migration %s", key)
31+
if !newVmiMigration.DeletionTimestamp.IsZero() ||
32+
oldVmiMigration.Status.Phase != newVmiMigration.Status.Phase {
33+
key := cache.MetaObjectToName(newVmiMigration).String()
34+
klog.Infof("enqueue update VMI migration %s (phase: %s -> %s)",
35+
key, oldVmiMigration.Status.Phase, newVmiMigration.Status.Phase)
3436
c.addOrUpdateVMIMigrationQueue.Add(key)
3537
}
3638
}
@@ -93,6 +95,9 @@ func (c *Controller) handleDeleteVM(key string) error {
9395
return nil
9496
}
9597

98+
// handleAddOrUpdateVMIMigration handles VirtualMachineInstanceMigration events.
99+
// KubeVirt ensures only ONE active migration per VMI at any time.
100+
// Process one migration at a time.
96101
func (c *Controller) handleAddOrUpdateVMIMigration(key string) error {
97102
namespace, name, err := cache.SplitMetaNamespaceKey(key)
98103
if err != nil {
@@ -105,102 +110,128 @@ func (c *Controller) handleAddOrUpdateVMIMigration(key string) error {
105110
utilruntime.HandleError(fmt.Errorf("failed to get VMI migration by key %s: %w", key, err))
106111
return err
107112
}
113+
114+
migrationUID := vmiMigration.UID
115+
vmiName := vmiMigration.Spec.VMIName
116+
phase := vmiMigration.Status.Phase
117+
118+
// ====== MIGRATION LIFECYCLE MARKERS ======
119+
// Log migration lifecycle events for debugging and tracking
120+
switch phase {
121+
case kubevirtv1.MigrationPending:
122+
klog.Infof(">>> [MIGRATION START] New migration %s (UID: %s) for VMI %s/%s - Phase: %s",
123+
key, migrationUID, namespace, vmiName, phase)
124+
case kubevirtv1.MigrationSucceeded:
125+
klog.Infof("<<< [MIGRATION END] Migration %s (UID: %s) for VMI %s/%s SUCCEEDED",
126+
key, migrationUID, namespace, vmiName)
127+
case kubevirtv1.MigrationFailed:
128+
klog.Infof("<<< [MIGRATION END] Migration %s (UID: %s) for VMI %s/%s FAILED",
129+
key, migrationUID, namespace, vmiName)
130+
default:
131+
klog.Infof("--- [MIGRATION PROGRESS] Migration %s (UID: %s) for VMI %s/%s - Phase: %s",
132+
key, migrationUID, namespace, vmiName, phase)
133+
}
134+
135+
// Skip if migration state is not yet initialized
108136
if vmiMigration.Status.MigrationState == nil {
109-
klog.V(3).Infof("VirtualMachineInstanceMigration %s migration state is nil, skipping", key)
137+
klog.V(3).Infof("VirtualMachineInstanceMigration %s (UID: %s) migration state is nil, waiting for KubeVirt to initialize",
138+
key, migrationUID)
110139
return nil
111140
}
112141

142+
// Skip completed migrations (already processed in final phase)
113143
if vmiMigration.Status.MigrationState.Completed {
114-
klog.V(3).Infof("VirtualMachineInstanceMigration %s migration state is completed, skipping", key)
144+
klog.V(3).Infof("VirtualMachineInstanceMigration %s (UID: %s) migration state is already completed, skipping",
145+
key, migrationUID)
115146
return nil
116147
}
117148

118-
vmi, err := c.config.KubevirtClient.VirtualMachineInstance(namespace).Get(context.TODO(), vmiMigration.Spec.VMIName, metav1.GetOptions{})
149+
// Get VMI to access current migration state
150+
vmi, err := c.config.KubevirtClient.VirtualMachineInstance(namespace).Get(context.TODO(), vmiName, metav1.GetOptions{})
119151
if err != nil {
120-
utilruntime.HandleError(fmt.Errorf("failed to get VMI by name %s: %w", vmiMigration.Spec.VMIName, err))
152+
utilruntime.HandleError(fmt.Errorf("failed to get VMI by name %s: %w", vmiName, err))
121153
return err
122154
}
123155

124-
// use VirtualMachineInstance's MigrationState because VirtualMachineInstanceMigration's MigrationState is not updated until migration finished
125-
var srcNodeName, targetNodeName string
126-
if vmi.Status.MigrationState != nil {
127-
klog.Infof("current vmiMigration %s status %s, target Node %s, source Node %s, target Pod %s, source Pod %s", key,
128-
vmiMigration.Status.Phase,
129-
vmi.Status.MigrationState.TargetNode,
130-
vmi.Status.MigrationState.SourceNode,
131-
vmi.Status.MigrationState.TargetPod,
132-
vmi.Status.MigrationState.SourcePod)
133-
srcNodeName = vmi.Status.MigrationState.SourceNode
134-
targetNodeName = vmi.Status.MigrationState.TargetNode
135-
} else {
136-
klog.Infof("current vmiMigration %s status %s, vmi MigrationState is nil", key, vmiMigration.Status.Phase)
156+
portName := ovs.PodNameToPortName(vmiName, namespace, util.OvnProvider)
157+
158+
// ====== MIGRATION UID VALIDATION ======
159+
// Only use vmi.Status.MigrationState if its MigrationUID matches the current vmiMigration.UID.
160+
// This prevents using stale state from a previous migration (e.g., stale A→B data when doing B→X migration).
161+
// Wait for MigrationState to be fully populated before processing - there's enough time between
162+
// target pod creation and actual migration start.
163+
if vmi.Status.MigrationState == nil {
164+
klog.V(3).Infof("Migration %s (UID: %s) - VMI MigrationState is nil, waiting for KubeVirt to populate",
165+
key, migrationUID)
166+
return nil
137167
}
138168

139-
portName := ovs.PodNameToPortName(vmiMigration.Spec.VMIName, vmiMigration.Namespace, util.OvnProvider)
140-
switch vmiMigration.Status.Phase {
169+
vmiMigrationUID := vmi.Status.MigrationState.MigrationUID
170+
if vmiMigrationUID != migrationUID {
171+
// Only process the current migration; previous migrations are assumed to be already handled
172+
klog.Warningf("Migration %s (UID: %s) - VMI MigrationState is STALE (contains old UID: %s), skipping",
173+
key, migrationUID, vmiMigrationUID)
174+
return nil
175+
}
176+
177+
srcNodeName := vmi.Status.MigrationState.SourceNode
178+
targetNodeName := vmi.Status.MigrationState.TargetNode
179+
if srcNodeName == "" || targetNodeName == "" {
180+
klog.V(3).Infof("Migration %s (UID: %s) - VMI MigrationState incomplete (source: %q, target: %q), waiting for KubeVirt",
181+
key, migrationUID, srcNodeName, targetNodeName)
182+
return nil
183+
}
184+
185+
klog.Infof("Migration %s (UID: %s) - source: %s, target: %s", key, migrationUID, srcNodeName, targetNodeName)
186+
187+
switch phase {
141188
case kubevirtv1.MigrationScheduling:
142-
selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
143-
MatchLabels: map[string]string{
144-
kubevirtv1.MigrationJobLabel: string(vmiMigration.UID),
145-
},
146-
})
147-
if err != nil {
148-
err = fmt.Errorf("failed to create label selector for migration job UID %s: %w", vmiMigration.UID, err)
149-
klog.Error(err)
150-
return err
189+
// Skip if source and target are the same (shouldn't happen in normal migration)
190+
if srcNodeName == targetNodeName {
191+
klog.Warningf("Migration %s (UID: %s) - Source and target are same node %s, skipping LSP setup",
192+
key, migrationUID, srcNodeName)
193+
return nil
151194
}
152195

153-
pods, err := c.podsLister.Pods(vmiMigration.Namespace).List(selector)
154-
if err != nil {
155-
err = fmt.Errorf("failed to list pods with migration job UID %s: %w", vmiMigration.UID, err)
196+
klog.Infof(">>> [MIGRATION LSP SET] Migration %s (UID: %s) - Setting LSP %s migrate options: %s -> %s",
197+
key, migrationUID, portName, srcNodeName, targetNodeName)
198+
199+
if err := c.OVNNbClient.SetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName); err != nil {
200+
err = fmt.Errorf("failed to set migrate options for lsp %s: %w", portName, err)
156201
klog.Error(err)
157202
return err
158203
}
159204

160-
if len(pods) > 0 {
161-
targetPod := pods[0]
162-
// During MigrationScheduling phase, use vmi.Status.NodeName if SourceNode is empty
163-
// because vmi.Status.MigrationState may not be fully synchronized yet
164-
sourceNode := srcNodeName
165-
if sourceNode == "" {
166-
sourceNode = vmi.Status.NodeName
167-
}
205+
klog.Infof(">>> [MIGRATION LSP SET OK] Migration %s (UID: %s) - Successfully set LSP %s migrate options",
206+
key, migrationUID, portName)
168207

169-
if sourceNode == "" || targetPod.Spec.NodeName == "" || sourceNode == targetPod.Spec.NodeName {
170-
klog.Warningf("VM pod %s/%s migration setup skipped, source node: %s, target node: %s (migration job UID: %s)",
171-
targetPod.Namespace, targetPod.Name, sourceNode, targetPod.Spec.NodeName, vmiMigration.UID)
172-
return nil
173-
}
174-
175-
klog.Infof("VM pod %s/%s is migrating from %s to %s (migration job UID: %s)",
176-
targetPod.Namespace, targetPod.Name, sourceNode, targetPod.Spec.NodeName, vmiMigration.UID)
177-
178-
if err := c.OVNNbClient.SetLogicalSwitchPortMigrateOptions(portName, sourceNode, targetPod.Spec.NodeName); err != nil {
179-
err = fmt.Errorf("failed to set migrate options for VM pod lsp %s: %w", portName, err)
180-
klog.Error(err)
181-
return err
182-
}
183-
klog.Infof("successfully set migrate options for lsp %s from %s to %s", portName, sourceNode, targetPod.Spec.NodeName)
184-
} else {
185-
klog.Warningf("target pod not yet created for migration job UID %s in phase %s, waiting for pod creation",
186-
vmiMigration.UID, vmiMigration.Status.Phase)
187-
return nil
188-
}
189208
case kubevirtv1.MigrationSucceeded:
190-
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated succeed", portName, srcNodeName, targetNodeName)
209+
klog.Infof("<<< [MIGRATION LSP RESET] Migration %s (UID: %s) - Resetting LSP %s to target %s (migration succeeded)",
210+
key, migrationUID, portName, targetNodeName)
211+
191212
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, false); err != nil {
192-
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
213+
err = fmt.Errorf("failed to reset migrate options for lsp %s: %w", portName, err)
193214
klog.Error(err)
194215
return err
195216
}
217+
218+
klog.Infof("<<< [MIGRATION LSP RESET OK] Migration %s (UID: %s) - Successfully reset LSP %s",
219+
key, migrationUID, portName)
220+
196221
case kubevirtv1.MigrationFailed:
197-
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated fail", portName, srcNodeName, targetNodeName)
222+
klog.Infof("<<< [MIGRATION LSP RESET] Migration %s (UID: %s) - Resetting LSP %s to source %s (migration failed)",
223+
key, migrationUID, portName, srcNodeName)
224+
198225
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, true); err != nil {
199-
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
226+
err = fmt.Errorf("failed to reset migrate options for lsp %s: %w", portName, err)
200227
klog.Error(err)
201228
return err
202229
}
230+
231+
klog.Infof("<<< [MIGRATION LSP RESET OK] Migration %s (UID: %s) - Successfully reset LSP %s",
232+
key, migrationUID, portName)
203233
}
234+
204235
return nil
205236
}
206237

0 commit comments

Comments
 (0)