Skip to content

Commit 0f4bbc9

Browse files
committed
fix: ensure kubevirt migration uses current migration state
Add MigrationUID validation to prevent stale state usage during consecutive VM migrations (e.g., A→B→A). Changes: 1. Add MigrationUID check: Only use vmi.Status.MigrationState if MigrationUID matches current vmiMigration.UID 2. Remove early return on Completed: Allow MigrationSucceeded/Failed phases to execute Reset logic 3. Add unit tests covering UID validation and migration scenarios The root cause was that vmi.Status.MigrationState could contain stale info from a previous migration, causing incorrect node detection and skipping SetLogicalSwitchPortMigrateOptions. Fixes: #6220 Signed-off-by: zbb88888 <[email protected]>
1 parent 8f63d67 commit 0f4bbc9

File tree

3 files changed

+368
-55
lines changed

3 files changed

+368
-55
lines changed

pkg/controller/kubevirt.go

Lines changed: 151 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,21 @@ import (
1818
)
1919

2020
func (c *Controller) enqueueAddVMIMigration(obj any) {
21-
key := cache.MetaObjectToName(obj.(*kubevirtv1.VirtualMachineInstanceMigration)).String()
21+
vmiMigration := obj.(*kubevirtv1.VirtualMachineInstanceMigration)
22+
key := cache.MetaObjectToName(vmiMigration).String()
2223
klog.Infof("enqueue add VMI migration %s", key)
2324
c.addOrUpdateVMIMigrationQueue.Add(key)
2425
}
2526

2627
func (c *Controller) enqueueUpdateVMIMigration(oldObj, newObj any) {
27-
oldVmi := oldObj.(*kubevirtv1.VirtualMachineInstanceMigration)
28-
newVmi := newObj.(*kubevirtv1.VirtualMachineInstanceMigration)
28+
oldVmiMigration := oldObj.(*kubevirtv1.VirtualMachineInstanceMigration)
29+
newVmiMigration := newObj.(*kubevirtv1.VirtualMachineInstanceMigration)
2930

30-
if !newVmi.DeletionTimestamp.IsZero() ||
31-
oldVmi.Status.Phase != newVmi.Status.Phase {
32-
key := cache.MetaObjectToName(newVmi).String()
33-
klog.Infof("enqueue update VMI migration %s", key)
31+
if !newVmiMigration.DeletionTimestamp.IsZero() ||
32+
oldVmiMigration.Status.Phase != newVmiMigration.Status.Phase {
33+
key := cache.MetaObjectToName(newVmiMigration).String()
34+
klog.Infof("enqueue update VMI migration %s (phase: %s -> %s)",
35+
key, oldVmiMigration.Status.Phase, newVmiMigration.Status.Phase)
3436
c.addOrUpdateVMIMigrationQueue.Add(key)
3537
}
3638
}
@@ -105,102 +107,197 @@ func (c *Controller) handleAddOrUpdateVMIMigration(key string) error {
105107
utilruntime.HandleError(fmt.Errorf("failed to get VMI migration by key %s: %w", key, err))
106108
return err
107109
}
110+
111+
migrationUID := vmiMigration.UID
112+
vmiName := vmiMigration.Spec.VMIName
113+
phase := vmiMigration.Status.Phase
114+
115+
// ====== MIGRATION LIFECYCLE MARKERS ======
116+
// Log migration lifecycle events for debugging and tracking
117+
switch phase {
118+
case kubevirtv1.MigrationPending:
119+
klog.Infof(">>> [MIGRATION START] New migration %s (UID: %s) for VMI %s/%s - Phase: %s",
120+
key, migrationUID, namespace, vmiName, phase)
121+
case kubevirtv1.MigrationSucceeded:
122+
klog.Infof("<<< [MIGRATION END] Migration %s (UID: %s) for VMI %s/%s SUCCEEDED",
123+
key, migrationUID, namespace, vmiName)
124+
case kubevirtv1.MigrationFailed:
125+
klog.Infof("<<< [MIGRATION END] Migration %s (UID: %s) for VMI %s/%s FAILED",
126+
key, migrationUID, namespace, vmiName)
127+
default:
128+
klog.Infof("--- [MIGRATION PROGRESS] Migration %s (UID: %s) for VMI %s/%s - Phase: %s",
129+
key, migrationUID, namespace, vmiName, phase)
130+
}
131+
132+
// Skip if migration state is not yet initialized
108133
if vmiMigration.Status.MigrationState == nil {
109-
klog.V(3).Infof("VirtualMachineInstanceMigration %s migration state is nil, skipping", key)
134+
klog.V(3).Infof("VirtualMachineInstanceMigration %s (UID: %s) migration state is nil, waiting for KubeVirt to initialize",
135+
key, migrationUID)
110136
return nil
111137
}
112138

139+
// Skip completed migrations (already processed in final phase)
113140
if vmiMigration.Status.MigrationState.Completed {
114-
klog.V(3).Infof("VirtualMachineInstanceMigration %s migration state is completed, skipping", key)
141+
klog.V(3).Infof("VirtualMachineInstanceMigration %s (UID: %s) migration state is already completed, skipping",
142+
key, migrationUID)
115143
return nil
116144
}
117145

118-
vmi, err := c.config.KubevirtClient.VirtualMachineInstance(namespace).Get(context.TODO(), vmiMigration.Spec.VMIName, metav1.GetOptions{})
146+
// Get VMI to access current migration state
147+
vmi, err := c.config.KubevirtClient.VirtualMachineInstance(namespace).Get(context.TODO(), vmiName, metav1.GetOptions{})
119148
if err != nil {
120-
utilruntime.HandleError(fmt.Errorf("failed to get VMI by name %s: %w", vmiMigration.Spec.VMIName, err))
149+
utilruntime.HandleError(fmt.Errorf("failed to get VMI by name %s: %w", vmiName, err))
121150
return err
122151
}
123152

124-
// use VirtualMachineInstance's MigrationState because VirtualMachineInstanceMigration's MigrationState is not updated until migration finished
153+
portName := ovs.PodNameToPortName(vmiName, namespace, util.OvnProvider)
154+
155+
// ====== MIGRATION UID VALIDATION ======
156+
// CRITICAL: Only use vmi.Status.MigrationState if its MigrationUID matches the current vmiMigration.UID
157+
// This prevents using stale state from a previous migration (e.g., A→B data when doing B→A migration)
125158
var srcNodeName, targetNodeName string
159+
var migrationStateValid bool
160+
126161
if vmi.Status.MigrationState != nil {
127-
klog.Infof("current vmiMigration %s status %s, target Node %s, source Node %s, target Pod %s, source Pod %s", key,
128-
vmiMigration.Status.Phase,
129-
vmi.Status.MigrationState.TargetNode,
130-
vmi.Status.MigrationState.SourceNode,
131-
vmi.Status.MigrationState.TargetPod,
132-
vmi.Status.MigrationState.SourcePod)
133-
srcNodeName = vmi.Status.MigrationState.SourceNode
134-
targetNodeName = vmi.Status.MigrationState.TargetNode
162+
vmiMigrationUID := vmi.Status.MigrationState.MigrationUID
163+
if vmiMigrationUID == migrationUID {
164+
// MigrationUID matches - this state belongs to current migration
165+
migrationStateValid = true
166+
srcNodeName = vmi.Status.MigrationState.SourceNode
167+
targetNodeName = vmi.Status.MigrationState.TargetNode
168+
klog.Infof("Migration %s (UID: %s) - VMI MigrationState is VALID (UID match), source: %s, target: %s, targetPod: %s",
169+
key, migrationUID, srcNodeName, targetNodeName, vmi.Status.MigrationState.TargetPod)
170+
} else {
171+
// MigrationUID mismatch - this is STALE state from a PREVIOUS migration
172+
// Clean up any residual migrate options from the previous migration
173+
klog.Warningf("Migration %s (UID: %s) - VMI MigrationState is STALE (UID mismatch: VMI has %s), cleaning residual migrate options",
174+
key, migrationUID, vmiMigrationUID)
175+
if err := c.OVNNbClient.CleanLogicalSwitchPortMigrateOptions(portName); err != nil {
176+
klog.Errorf("failed to clean stale migrate options for lsp %s: %v", portName, err)
177+
return err
178+
}
179+
// Don't return here - continue processing the current migration
180+
// The current migration will set new options in MigrationScheduling phase
181+
}
135182
} else {
136-
klog.Infof("current vmiMigration %s status %s, vmi MigrationState is nil", key, vmiMigration.Status.Phase)
183+
klog.V(3).Infof("Migration %s (UID: %s) - VMI MigrationState is nil, waiting for KubeVirt to populate",
184+
key, migrationUID)
137185
}
138186

139-
portName := ovs.PodNameToPortName(vmiMigration.Spec.VMIName, vmiMigration.Namespace, util.OvnProvider)
140-
switch vmiMigration.Status.Phase {
187+
switch phase {
141188
case kubevirtv1.MigrationScheduling:
189+
// During MigrationScheduling, get target node from Pod (more reliable than vmi.Status.MigrationState)
142190
selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
143191
MatchLabels: map[string]string{
144-
kubevirtv1.MigrationJobLabel: string(vmiMigration.UID),
192+
kubevirtv1.MigrationJobLabel: string(migrationUID),
145193
},
146194
})
147195
if err != nil {
148-
err = fmt.Errorf("failed to create label selector for migration job UID %s: %w", vmiMigration.UID, err)
196+
err = fmt.Errorf("failed to create label selector for migration job UID %s: %w", migrationUID, err)
149197
klog.Error(err)
150198
return err
151199
}
152200

153-
pods, err := c.podsLister.Pods(vmiMigration.Namespace).List(selector)
201+
pods, err := c.podsLister.Pods(namespace).List(selector)
154202
if err != nil {
155-
err = fmt.Errorf("failed to list pods with migration job UID %s: %w", vmiMigration.UID, err)
203+
err = fmt.Errorf("failed to list pods with migration job UID %s: %w", migrationUID, err)
156204
klog.Error(err)
157205
return err
158206
}
159207

160-
if len(pods) > 0 {
161-
targetPod := pods[0]
162-
// During MigrationScheduling phase, use vmi.Status.NodeName if SourceNode is empty
163-
// because vmi.Status.MigrationState may not be fully synchronized yet
164-
sourceNode := srcNodeName
165-
if sourceNode == "" {
166-
sourceNode = vmi.Status.NodeName
167-
}
208+
if len(pods) == 0 {
209+
klog.V(3).Infof("Migration %s (UID: %s) - Target pod not yet created, waiting for pod creation",
210+
key, migrationUID)
211+
return nil
212+
}
168213

169-
if sourceNode == "" || targetPod.Spec.NodeName == "" || sourceNode == targetPod.Spec.NodeName {
170-
klog.Warningf("VM pod %s/%s migration setup skipped, source node: %s, target node: %s (migration job UID: %s)",
171-
targetPod.Namespace, targetPod.Name, sourceNode, targetPod.Spec.NodeName, vmiMigration.UID)
172-
return nil
173-
}
214+
targetPod := pods[0]
215+
targetNode := targetPod.Spec.NodeName
174216

175-
klog.Infof("VM pod %s/%s is migrating from %s to %s (migration job UID: %s)",
176-
targetPod.Namespace, targetPod.Name, sourceNode, targetPod.Spec.NodeName, vmiMigration.UID)
217+
// Use vmi.Status.NodeName as source if MigrationState is not valid or SourceNode is empty
218+
sourceNode := srcNodeName
219+
if sourceNode == "" {
220+
sourceNode = vmi.Status.NodeName
221+
klog.V(3).Infof("Migration %s (UID: %s) - Using vmi.Status.NodeName as source: %s",
222+
key, migrationUID, sourceNode)
223+
}
177224

178-
if err := c.OVNNbClient.SetLogicalSwitchPortMigrateOptions(portName, sourceNode, targetPod.Spec.NodeName); err != nil {
179-
err = fmt.Errorf("failed to set migrate options for VM pod lsp %s: %w", portName, err)
180-
klog.Error(err)
225+
// Validate we have both source and target
226+
if sourceNode == "" || targetNode == "" {
227+
klog.Warningf("Migration %s (UID: %s) - Cannot set LSP options: source=%q, target=%q (waiting for nodes)",
228+
key, migrationUID, sourceNode, targetNode)
229+
return nil
230+
}
231+
232+
// Skip if source and target are the same (shouldn't happen in normal migration)
233+
if sourceNode == targetNode {
234+
klog.Warningf("Migration %s (UID: %s) - Source and target are same node %s, skipping LSP setup",
235+
key, migrationUID, sourceNode)
236+
return nil
237+
}
238+
239+
klog.Infof(">>> [MIGRATION LSP SET] Migration %s (UID: %s) - Setting LSP %s migrate options: %s -> %s",
240+
key, migrationUID, portName, sourceNode, targetNode)
241+
242+
if err := c.OVNNbClient.SetLogicalSwitchPortMigrateOptions(portName, sourceNode, targetNode); err != nil {
243+
err = fmt.Errorf("failed to set migrate options for lsp %s: %w", portName, err)
244+
klog.Error(err)
245+
return err
246+
}
247+
248+
klog.Infof(">>> [MIGRATION LSP SET OK] Migration %s (UID: %s) - Successfully set LSP %s migrate options",
249+
key, migrationUID, portName)
250+
251+
case kubevirtv1.MigrationSucceeded:
252+
// For final phases, we need valid migration state to know source/target
253+
if !migrationStateValid {
254+
klog.Warningf("Migration %s (UID: %s) SUCCEEDED but VMI MigrationState is invalid/stale, cannot reset LSP options properly",
255+
key, migrationUID)
256+
// Try to clean up migrate options anyway using CleanLogicalSwitchPortMigrateOptions
257+
if err := c.OVNNbClient.CleanLogicalSwitchPortMigrateOptions(portName); err != nil {
258+
klog.Errorf("failed to clean migrate options for lsp %s: %v", portName, err)
181259
return err
182260
}
183-
klog.Infof("successfully set migrate options for lsp %s from %s to %s", portName, sourceNode, targetPod.Spec.NodeName)
184-
} else {
185-
klog.Warningf("target pod not yet created for migration job UID %s in phase %s, waiting for pod creation",
186-
vmiMigration.UID, vmiMigration.Status.Phase)
187261
return nil
188262
}
189-
case kubevirtv1.MigrationSucceeded:
190-
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated succeed", portName, srcNodeName, targetNodeName)
263+
264+
klog.Infof("<<< [MIGRATION LSP RESET] Migration %s (UID: %s) - Resetting LSP %s to target %s (migration succeeded)",
265+
key, migrationUID, portName, targetNodeName)
266+
191267
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, false); err != nil {
192-
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
268+
err = fmt.Errorf("failed to reset migrate options for lsp %s: %w", portName, err)
193269
klog.Error(err)
194270
return err
195271
}
272+
273+
klog.Infof("<<< [MIGRATION LSP RESET OK] Migration %s (UID: %s) - Successfully reset LSP %s",
274+
key, migrationUID, portName)
275+
196276
case kubevirtv1.MigrationFailed:
197-
klog.Infof("migrate end reset options for lsp %s from %s to %s, migrated fail", portName, srcNodeName, targetNodeName)
277+
if !migrationStateValid {
278+
klog.Warningf("Migration %s (UID: %s) FAILED but VMI MigrationState is invalid/stale, cannot reset LSP options properly",
279+
key, migrationUID)
280+
// Try to clean up migrate options anyway
281+
if err := c.OVNNbClient.CleanLogicalSwitchPortMigrateOptions(portName); err != nil {
282+
klog.Errorf("failed to clean migrate options for lsp %s: %v", portName, err)
283+
return err
284+
}
285+
return nil
286+
}
287+
288+
klog.Infof("<<< [MIGRATION LSP RESET] Migration %s (UID: %s) - Resetting LSP %s to source %s (migration failed)",
289+
key, migrationUID, portName, srcNodeName)
290+
198291
if err := c.OVNNbClient.ResetLogicalSwitchPortMigrateOptions(portName, srcNodeName, targetNodeName, true); err != nil {
199-
err = fmt.Errorf("failed to clean migrate options for lsp %s, %w", portName, err)
292+
err = fmt.Errorf("failed to reset migrate options for lsp %s: %w", portName, err)
200293
klog.Error(err)
201294
return err
202295
}
296+
297+
klog.Infof("<<< [MIGRATION LSP RESET OK] Migration %s (UID: %s) - Successfully reset LSP %s",
298+
key, migrationUID, portName)
203299
}
300+
204301
return nil
205302
}
206303

0 commit comments

Comments
 (0)