@@ -17,12 +17,12 @@ import (
1717 v1 "k8s.io/api/core/v1"
1818)
1919
20- // TODO make configurable as flags.
21- var (
22- defaultRetryCount = 999
23- defaultRetryWaitTime = 10 * time .Second
24- defaultRetryMaxWaitTime = 30 * time .Second
25- )
20+ // RestyConfig holds the retry settings handed to the resty client that polls shard allocation while a pod is drained.
21+ type RestyConfig struct {
22+ ClientRetryCount int // passed to resty's SetRetryCount
23+ ClientRetryWaitTime time.Duration // passed to resty's SetRetryWaitTime
24+ ClientRetryMaxWaitTime time.Duration // passed to resty's SetRetryMaxWaitTime
25+ }
2626
2727// ESClient is a pod drainer which can drain data from Elasticsearch pods.
2828type ESClient struct {
@@ -91,7 +91,7 @@ func (c *ESClient) logger() *log.Entry {
9191}
9292
9393// Drain drains data from an Elasticsearch pod.
94- func (c * ESClient ) Drain (ctx context.Context , pod * v1.Pod ) error {
94+ func (c * ESClient ) Drain (ctx context.Context , pod * v1.Pod , config * RestyConfig ) error {
9595
9696 c .logger ().Info ("Ensuring cluster is in green state" )
9797
@@ -111,7 +111,7 @@ func (c *ESClient) Drain(ctx context.Context, pod *v1.Pod) error {
111111 }
112112
113113 c .logger ().Info ("Waiting for draining to finish" )
114- return c .waitForEmptyEsNode (ctx , pod )
114+ return c .waitForEmptyEsNode (ctx , pod , config )
115115}
116116
117117func (c * ESClient ) Cleanup (ctx context.Context ) error {
@@ -220,6 +220,7 @@ func (c *ESClient) excludePodIP(pod *v1.Pod) error {
220220 if excludeString != "" {
221221 ips = strings .Split (excludeString , "," )
222222 }
223+
223224 var foundPodIP bool
224225 for _ , ip := range ips {
225226 if ip == podIP {
@@ -256,6 +257,42 @@ func (c *ESClient) setExcludeIPs(ips string) error {
256257 return nil
257258}
258259
260+ // removeFromExcludeIPList drops the pod's IP from the Elasticsearch exclude._ip list read from the transient cluster settings and writes the sorted remainder back only when the resulting string differs; it returns any error from reading or updating the settings.
261+ func (c *ESClient) removeFromExcludeIPList(pod *v1.Pod) error {
262+ // Hold the mutex across the whole read-modify-write of the exclude list.
263+ c.mux.Lock()
264+ defer c.mux.Unlock()
265+
266+ podIP := pod.Status.PodIP
267+
268+ esSettings, err := c.getClusterSettings()
269+ if err != nil {
270+ return err
271+ }
272+
273+ excludedIPsString := esSettings.Transient.Cluster.Routing.Allocation.Exclude.IP
274+ excludedIPs := strings.Split(excludedIPsString, ",")
275+ var newExcludedIPs []string
276+ for _, excludeIP := range excludedIPs {
277+ if excludeIP != podIP {
278+ newExcludedIPs = append(newExcludedIPs, excludeIP)
279+ }
280+ }
281+ sort.Strings(newExcludedIPs) // sort once after filtering rather than after every append
282+
283+ newExcludedIPsString := strings.Join(newExcludedIPs, ",")
284+ if newExcludedIPsString != excludedIPsString {
285+ c.logger().Infof("Setting exclude list to '%s'", newExcludedIPsString)
286+
287+ err = c.setExcludeIPs(newExcludedIPsString)
288+ if err != nil {
289+ return err
290+ }
291+ }
292+
293+ return nil
294+ }
295+
259296func (c * ESClient ) updateAutoRebalance (value string ) error {
260297 resp , err := resty .New ().R ().
261298 SetHeader ("Content-Type" , "application/json" ).
@@ -276,13 +313,13 @@ func (c *ESClient) updateAutoRebalance(value string) error {
276313}
277314
278315// repeatedly query shard allocations to ensure success of drain operation.
279- func (c * ESClient ) waitForEmptyEsNode (ctx context.Context , pod * v1.Pod ) error {
316+ func (c * ESClient ) waitForEmptyEsNode (ctx context.Context , pod * v1.Pod , config * RestyConfig ) error {
280317 // TODO: implement context handling
281318 podIP := pod .Status .PodIP
282- _ , err := resty .New ().
283- SetRetryCount (defaultRetryCount ).
284- SetRetryWaitTime (defaultRetryWaitTime ).
285- SetRetryMaxWaitTime (defaultRetryMaxWaitTime ).
319+ resp , err := resty .New ().
320+ SetRetryCount (config . ClientRetryCount ).
321+ SetRetryWaitTime (config . ClientRetryWaitTime ).
322+ SetRetryMaxWaitTime (config . ClientRetryMaxWaitTime ).
286323 AddRetryCondition (
287324 // It is expected to return (bool, error) pair. Resty will retry
288325 // in case condition returns true or non nil error.
@@ -292,7 +329,6 @@ func (c *ESClient) waitForEmptyEsNode(ctx context.Context, pod *v1.Pod) error {
292329 if err != nil {
293330 return true , err
294331 }
295- // shardIP := make(map[string]bool)
296332 remainingShards := 0
297333 for _ , shard := range shards {
298334 if shard .IP == podIP {
@@ -312,9 +348,33 @@ func (c *ESClient) waitForEmptyEsNode(ctx context.Context, pod *v1.Pod) error {
312348 },
313349 ).R ().
314350 Get (c .Endpoint .String () + "/_cat/shards?h=index,ip&format=json" )
351+
315352 if err != nil {
316353 return err
317354 }
355+
356+ // make sure the IP is still excluded, this could have been updated in the meantime.
357+ if err = c .excludePodIP (pod ); err != nil {
358+ return err
359+ }
360+
361+ var shards []ESShard
362+ err = json .Unmarshal (resp .Body (), & shards )
363+ if err != nil {
364+ return err
365+ }
366+
367+ for _ , shard := range shards {
368+ if shard .IP == podIP {
369+ err = fmt .Errorf ("Cannot migrate shards from pod '%s' with IP '%s' within provided intervals" , pod .ObjectMeta .Name , pod .Status .PodIP )
370+ // if the node cannot be drained, return it to the pool of active nodes
371+ if errExclude := c .removeFromExcludeIPList (pod ); errExclude != nil {
372+ return fmt .Errorf ("during handling request error: '%v' another error has been raised '%v'" , err , errExclude )
373+ }
374+ return err
375+ }
376+ }
377+
318378 return nil
319379}
320380
@@ -452,7 +512,7 @@ func (c *ESClient) CreateIndex(indexName, groupName string, shards, replicas int
452512 SetHeader ("Content-Type" , "application/json" ).
453513 SetBody ([]byte (
454514 fmt .Sprintf (
455- `{"settings": {"index" : {"number_of_replicas" : "%d", "number_of_shards": "%d",
515+ `{"settings": {"index" : {"number_of_replicas" : "%d", "number_of_shards": "%d",
456516"routing.allocation.include.group": "%s"}}}` ,
457517 replicas ,
458518 shards ,
0 commit comments