Skip to content

Commit e2d1764

Browse files
authored
Merge pull request #109 from chenchun/uid
Fix a race condition that resync released ip due to pod uid missmatch
2 parents fe02438 + 91e055d commit e2d1764

File tree

2 files changed

+50
-10
lines changed

2 files changed

+50
-10
lines changed

pkg/ipam/schedulerplugin/floatingip_plugin.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package schedulerplugin
1919
import (
2020
"fmt"
2121
"net"
22+
"runtime"
2223
"sync"
2324
"time"
2425

@@ -223,7 +224,19 @@ func (p *FloatingIPPlugin) GetIpam() floatingip.IPAM {
223224

224225
func (p *FloatingIPPlugin) lockPod(name, namespace string) func() {
225226
key := fmt.Sprintf("%s_%s", namespace, name)
227+
start := time.Now()
226228
p.podLockPool.LockKey(key)
229+
elapsed := (time.Now().UnixNano() - start.UnixNano()) / 1e6
230+
if elapsed > 500 {
231+
var caller string
232+
pc, _, no, ok := runtime.Caller(1)
233+
details := runtime.FuncForPC(pc)
234+
if ok && details != nil {
235+
caller = fmt.Sprintf("called from %s:%d\n", details.Name(), no)
236+
}
237+
glog.Infof("acquire lock for %s took %d ms, started at %s, %s", key, elapsed,
238+
start.Format("15:04:05.000"), caller)
239+
}
227240
return func() {
228241
_ = p.podLockPool.UnlockKey(key)
229242
}

pkg/ipam/schedulerplugin/resync.go

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ func (p *FloatingIPPlugin) fetchChecklist(meta *resyncMeta) error {
7777
glog.Warningf("unexpected key: %s", fip.Key)
7878
continue
7979
}
80+
if fip.PodUid == "" && fip.NodeName == "" && !keyObj.Deployment() &&
81+
constant.ReleasePolicy(fip.Policy) == constant.ReleasePolicyNever {
82+
// skip endless checking pod exists for never policy
83+
continue
84+
}
8085
meta.allocatedIPs = append(meta.allocatedIPs, resyncObj{keyObj: keyObj, fip: fip.FloatingIP})
8186
}
8287
return nil
@@ -88,9 +93,22 @@ func (p *FloatingIPPlugin) resyncAllocatedIPs(meta *resyncMeta) {
8893
key := obj.keyObj.KeyInDB
8994
func() {
9095
defer p.lockPod(obj.keyObj.PodName, obj.keyObj.Namespace)()
91-
if p.podRunning(obj.keyObj.PodName, obj.keyObj.Namespace, obj.fip.PodUid) {
96+
// we are holding the pod's lock, query again in case the ip has been reallocated.
97+
fip, err := p.ipam.ByIP(obj.fip.IP)
98+
if err != nil {
99+
glog.Warning(err)
92100
return
93101
}
102+
if fip.Key != obj.fip.Key {
103+
// if key changed, abort
104+
return
105+
}
106+
obj.fip = fip
107+
running, reason := p.podRunning(obj.keyObj.PodName, obj.keyObj.Namespace, obj.fip.PodUid)
108+
if running {
109+
return
110+
}
111+
glog.Infof("%s is not running, %s", obj.keyObj.KeyInDB, reason)
94112
if p.cloudProvider != nil && obj.fip.NodeName != "" {
95113
// For tapp and sts pod, nodeName will be updated to empty after unassigning
96114
glog.Infof("UnAssignIP nodeName %s, ip %s, key %s during resync", obj.fip.NodeName,
@@ -122,28 +140,37 @@ func (p *FloatingIPPlugin) resyncAllocatedIPs(meta *resyncMeta) {
122140
}
123141
}
124142

125-
func (p *FloatingIPPlugin) podRunning(podName, namespace, podUid string) bool {
143+
func (p *FloatingIPPlugin) podRunning(podName, namespace, podUid string) (bool, string) {
126144
pod, err := p.PodLister.Pods(namespace).Get(podName)
127-
if runningAndUidMatch(podUid, pod, err) {
128-
return true
145+
running, reason1 := runningAndUidMatch(podUid, pod, err)
146+
if running {
147+
return true, ""
129148
}
130149
// double check with apiserver to confirm it is not running
131150
pod, err = p.Client.CoreV1().Pods(namespace).Get(podName, v1.GetOptions{})
132-
return runningAndUidMatch(podUid, pod, err)
151+
running, reason2 := runningAndUidMatch(podUid, pod, err)
152+
if running {
153+
return true, ""
154+
}
155+
return false, "from podlist: " + reason1 + ", from client: " + reason2
133156
}
134157

135-
func runningAndUidMatch(storedUid string, pod *corev1.Pod, err error) bool {
158+
func runningAndUidMatch(storedUid string, pod *corev1.Pod, err error) (bool, string) {
136159
if err != nil {
137160
if metaErrs.IsNotFound(err) {
138-
return false
161+
return false, "pod not found"
139162
}
140163
// we cannot figure out whether pod exist or not, we'd better keep the ip
141-
return true
164+
return true, ""
142165
}
143166
if storedUid != "" && storedUid != string(pod.GetUID()) {
144-
return false
167+
return false, fmt.Sprintf("pod current uid %s missmatch stored uid %s", string(pod.GetUID()), storedUid)
168+
}
169+
if !finished(pod) {
170+
return true, ""
171+
} else {
172+
return false, "pod finished"
145173
}
146-
return !finished(pod)
147174
}
148175

149176
func parsePodIndex(name string) (int, error) {

0 commit comments

Comments
 (0)