Skip to content
This repository was archived by the owner on Feb 1, 2021. It is now read-only.

Commit dcaaa55

Browse files
authored
Merge pull request #2436 from dongluochen/rescheduleWithOverlay
Reschedule containers with global network
2 parents 25919db + c3b9a53 commit dcaaa55

File tree

1 file changed

+66
-9
lines changed

1 file changed

+66
-9
lines changed

cluster/watchdog.go

+66-9
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,11 @@ package cluster
22

33
import (
44
"sync"
5+
"time"
56

67
log "github.com/Sirupsen/logrus"
8+
"github.com/docker/engine-api/types/network"
9+
"golang.org/x/net/context"
710
)
811

912
// Watchdog listens to cluster events and handles container rescheduling
@@ -61,6 +64,7 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
6164
defer w.Unlock()
6265

6366
log.Debugf("Node %s failed - rescheduling containers", e.ID)
67+
6468
for _, c := range e.Containers() {
6569

6670
// Skip containers which don't have an "on-node-failure" reschedule policy.
@@ -75,23 +79,76 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
7579
// will abort because the name is already taken.
7680
c.Engine.removeContainer(c)
7781

78-
newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
82+
// keep track of all global networks this container is connected to
83+
globalNetworks := make(map[string]*network.EndpointSettings)
84+
// if the existing container has global network endpoints,
85+
// they need to be removed with force option
86+
// "docker network disconnect -f network containername" only takes containername
87+
name := c.Info.Name
88+
if len(name) == 0 || len(name) == 1 && name[0] == '/' {
89+
log.Errorf("container %s has no name", c.ID)
90+
continue
91+
}
92+
// cut preceding '/'
93+
if name[0] == '/' {
94+
name = name[1:]
95+
}
96+
97+
if c.NetworkSettings != nil && len(c.NetworkSettings.Networks) > 0 {
98+
// find an engine to do disconnect work
99+
randomEngine, err := w.cluster.RANDOMENGINE()
100+
if err != nil {
101+
log.Errorf("Failed to find an engine to do network cleanup for container %s: %v", c.ID, err)
102+
// add the container back, so we can retry later
103+
c.Engine.AddContainer(c)
104+
continue
105+
}
106+
107+
clusterNetworks := w.cluster.Networks().Uniq()
108+
for networkName, endpoint := range c.NetworkSettings.Networks {
109+
net := clusterNetworks.Get(endpoint.NetworkID)
110+
if net != nil && net.Scope == "global" {
111+
// record the network; it should be reconstructed on the new container
112+
globalNetworks[networkName] = endpoint
113+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
114+
defer cancel()
115+
err = randomEngine.apiClient.NetworkDisconnect(ctx, networkName, name, true)
116+
if err != nil {
117+
// do not abort here as this endpoint might have been removed before
118+
log.Warnf("Failed to remove network endpoint from old container %s: %v", name, err)
119+
}
120+
}
121+
}
122+
}
79123

124+
newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
80125
if err != nil {
81126
log.Errorf("Failed to reschedule container %s: %v", c.ID, err)
82127
// add the container back, so we can retry later
83128
c.Engine.AddContainer(c)
84-
} else {
85-
log.Infof("Rescheduled container %s from %s to %s as %s", c.ID, c.Engine.Name, newContainer.Engine.Name, newContainer.ID)
86-
if c.Info.State.Running {
87-
log.Infof("Container %s was running, starting container %s", c.ID, newContainer.ID)
88-
if err := w.cluster.StartContainer(newContainer, nil); err != nil {
89-
log.Errorf("Failed to start rescheduled container %s: %v", newContainer.ID, err)
90-
}
129+
continue
130+
}
131+
132+
// Docker create command cannot create a container with multiple networks
133+
// see https://github.com/docker/docker/issues/17750
134+
// Add the global networks one by one
135+
for networkName, endpoint := range globalNetworks {
136+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
137+
defer cancel()
138+
err = newContainer.Engine.apiClient.NetworkConnect(ctx, networkName, name, endpoint)
139+
if err != nil {
140+
log.Warnf("Failed to connect network %s to container %s: %v", networkName, name, err)
91141
}
92142
}
93-
}
94143

144+
log.Infof("Rescheduled container %s from %s to %s as %s", c.ID, c.Engine.Name, newContainer.Engine.Name, newContainer.ID)
145+
if c.Info.State.Running {
146+
log.Infof("Container %s was running, starting container %s", c.ID, newContainer.ID)
147+
if err := w.cluster.StartContainer(newContainer, nil); err != nil {
148+
log.Errorf("Failed to start rescheduled container %s: %v", newContainer.ID, err)
149+
}
150+
}
151+
}
95152
}
96153

97154
// NewWatchdog creates a new watchdog

0 commit comments

Comments
 (0)