@@ -2,8 +2,11 @@ package cluster
2
2
3
3
import (
4
4
"sync"
5
+ "time"
5
6
6
7
log "github.com/Sirupsen/logrus"
8
+ "github.com/docker/engine-api/types/network"
9
+ "golang.org/x/net/context"
7
10
)
8
11
9
12
// Watchdog listens to cluster events and handles container rescheduling
@@ -61,6 +64,7 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
61
64
defer w .Unlock ()
62
65
63
66
log .Debugf ("Node %s failed - rescheduling containers" , e .ID )
67
+
64
68
for _ , c := range e .Containers () {
65
69
66
70
// Skip containers which don't have an "on-node-failure" reschedule policy.
@@ -75,23 +79,76 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
75
79
// will abort because the name is already taken.
76
80
c .Engine .removeContainer (c )
77
81
78
- newContainer , err := w .cluster .CreateContainer (c .Config , c .Info .Name , nil )
82
+ // keep track of all global networks this container is connected to
83
+ globalNetworks := make (map [string ]* network.EndpointSettings )
84
+ // if the existing container has global network endpoints,
85
+ // they need to be removed with force option
86
+ // "docker network disconnect -f network containername" only takes containername
87
+ name := c .Info .Name
88
+ if len (name ) == 0 || len (name ) == 1 && name [0 ] == '/' {
89
+ log .Errorf ("container %s has no name" , c .ID )
90
+ continue
91
+ }
92
+ // cut preceding '/'
93
+ if name [0 ] == '/' {
94
+ name = name [1 :]
95
+ }
96
+
97
+ if c .NetworkSettings != nil && len (c .NetworkSettings .Networks ) > 0 {
98
+ // find an engine to do disconnect work
99
+ randomEngine , err := w .cluster .RANDOMENGINE ()
100
+ if err != nil {
101
+ log .Errorf ("Failed to find an engine to do network cleanup for container %s: %v" , c .ID , err )
102
+ // add the container back, so we can retry later
103
+ c .Engine .AddContainer (c )
104
+ continue
105
+ }
106
+
107
+ clusterNetworks := w .cluster .Networks ().Uniq ()
108
+ for networkName , endpoint := range c .NetworkSettings .Networks {
109
+ net := clusterNetworks .Get (endpoint .NetworkID )
110
+ if net != nil && net .Scope == "global" {
111
+ // record the network; it should be reconstructed on the new container
112
+ globalNetworks [networkName ] = endpoint
113
+ ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
114
+ defer cancel ()
115
+ err = randomEngine .apiClient .NetworkDisconnect (ctx , networkName , name , true )
116
+ if err != nil {
117
+ // do not abort here as this endpoint might have been removed before
118
+ log .Warnf ("Failed to remove network endpoint from old container %s: %v" , name , err )
119
+ }
120
+ }
121
+ }
122
+ }
79
123
124
+ newContainer , err := w .cluster .CreateContainer (c .Config , c .Info .Name , nil )
80
125
if err != nil {
81
126
log .Errorf ("Failed to reschedule container %s: %v" , c .ID , err )
82
127
// add the container back, so we can retry later
83
128
c .Engine .AddContainer (c )
84
- } else {
85
- log .Infof ("Rescheduled container %s from %s to %s as %s" , c .ID , c .Engine .Name , newContainer .Engine .Name , newContainer .ID )
86
- if c .Info .State .Running {
87
- log .Infof ("Container %s was running, starting container %s" , c .ID , newContainer .ID )
88
- if err := w .cluster .StartContainer (newContainer , nil ); err != nil {
89
- log .Errorf ("Failed to start rescheduled container %s: %v" , newContainer .ID , err )
90
- }
129
+ continue
130
+ }
131
+
132
+ // Docker create command cannot create a container with multiple networks
133
+ // see https://github.com/docker/docker/issues/17750
134
+ // Add the global networks one by one
135
+ for networkName , endpoint := range globalNetworks {
136
+ ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
137
+ defer cancel ()
138
+ err = newContainer .Engine .apiClient .NetworkConnect (ctx , networkName , name , endpoint )
139
+ if err != nil {
140
+ log .Warnf ("Failed to connect network %s to container %s: %v" , networkName , name , err )
91
141
}
92
142
}
93
- }
94
143
144
+ log .Infof ("Rescheduled container %s from %s to %s as %s" , c .ID , c .Engine .Name , newContainer .Engine .Name , newContainer .ID )
145
+ if c .Info .State .Running {
146
+ log .Infof ("Container %s was running, starting container %s" , c .ID , newContainer .ID )
147
+ if err := w .cluster .StartContainer (newContainer , nil ); err != nil {
148
+ log .Errorf ("Failed to start rescheduled container %s: %v" , newContainer .ID , err )
149
+ }
150
+ }
151
+ }
95
152
}
96
153
97
154
// NewWatchdog creates a new watchdog
0 commit comments