@@ -54,6 +54,26 @@ coreos:
ExecStart=/usr/bin/sh -c 'for u in update-engine locksmithd; do systemctl stop $${u}.service; systemctl mask $${u}.service; systemctl disable $${u}.service; done; systemctl reset-failed'
{{end}}
+ - name: handle-disable-request.service
+ enable: true
+ command: start
+ content: |
+ [Unit]
+ Description=Shuts down core services when requested
+ After=kubelet.service network-online.target
+ Wants=kubelet.service
+
+ [Service]
+ Type=simple
+ TimeoutStartSec=60m
+ Restart=on-failure
+ RestartSec=30
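+ # Only start once the kubelet is active; the watcher script then polls the Kubernetes API for disable requests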
+ ExecStartPre=/usr/bin/systemctl is-active kubelet
+ ExecStart=/opt/bin/handle-disable-request
+
+ [Install]
+ WantedBy=multi-user.target
+
- name: handle-cluster-cidr-changes.service
enable: true
command: start
@@ -5916,6 +5936,128 @@ write_files:
fi
done
+ - path: /opt/bin/handle-disable-request
+ permissions: 0755
+ content: |
+ #!/bin/bash
+ # Allows a controller to disable its core services upon request
+ # Created to allow more ambitious kubernetes upgrades and changes
+ # to cluster settings such as service_cidr or pod_cidr
+ #
+ # A disable request is a configmap named after the hostname, with a matching kubernetes version, containing the list of core services to stop:
+ # apiVersion: v1
+ # kind: ConfigMap
+ # metadata:
+ # name: kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal
+ # namespace: kube-system
+ # data:
+ # kubernetesVersion: v1.9.3
+ # disable: "kube-apiserver kube-controller-manager kube-scheduler"
+
+ retries=5
+ hyperkube_image="{{.HyperkubeImage.RepoWithTag}}"
+ my_kubernetes_version="{{.HyperkubeImage.Tag}}"
+ myhostname=$(hostname -f)
+ disable_confmap_name="kube-aws-migration-disable-${myhostname}"
+ valid_services="kube-apiserver kube-controller-manager kube-scheduler"
+
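+ # kubectl: run kubectl from the hyperkube image via docker on the host network, retrying up to $retries times before giving up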
+ kubectl() {
+ local tries=0
+ local result_text=""
+ local return_code=0
+
+ while [ "$tries" -lt "$retries" ]; do
+ result_text=$(docker run --rm -i --net=host -v /tmp:/tmp:rw -v /etc/kubernetes:/etc/kubernetes:ro -v /etc/resolv.conf:/etc/resolv.conf:ro $hyperkube_image /kubectl "$@")
+ return_code=$?
+ if [ "$return_code" -eq "0" ]; then
+ echo "${result_text}"
+ break
+ fi
+ sleep 10
+ tries=$((tries+1))
+ done
+ return $return_code
+ }
+
+ log() {
+ echo "$@" >&2
+ }
+
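+ # fetch this node's disable-request configmap; --ignore-not-found returns empty output when it does not exist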
+ get_disable_request() {
+ kubectl get cm -n kube-system $disable_confmap_name -o json --ignore-not-found
+ }
+
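+ # a disable request is only honoured when its kubernetesVersion matches this node's hyperkube version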
+ valid_disable_request() {
+ local disable_payload=$1
+
+ if [[ -n "${disable_payload}" ]]; then
+ log "found a disable request"
+ local kubernetes_version=$(echo ${disable_payload} | jq -er '.data.kubernetesVersion')
+ if [[ "${kubernetes_version}" == "${my_kubernetes_version}" ]]; then
+ log "valid request: kubernetes version match: ${kubernetes_version}"
+ return 0
+ else
+ log "invalid request: kubernetes version ${kubernetes_version} does not match my version ${my_kubernetes_version}"
+ return 1
+ fi
+ fi
+ log "no disable request found"
+ return 1
+ }
+
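+ # only the core control-plane services listed in $valid_services may be disabled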
+ valid_service() {
+ for s in $valid_services; do
+ if [[ "$s" == $1 ]]; then
+ return 0
+ fi
+ done
+ return 1
+ }
+
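+ # disable a service by moving its static-pod manifest out of /etc/kubernetes/manifests and stopping any container still running it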
+ disable_service() {
+ local service=$1
+
+ if [[ -f "/etc/kubernetes/manifests/${service}.yaml" ]]; then
+ log "Moving manifest /etc/kubernetes/manifests/${service}.yaml to /etc/kubernetes/${service}.yaml"
+ mv /etc/kubernetes/manifests/${service}.yaml /etc/kubernetes/${service}.yaml
+ else
+ log "No manifest found when looking for /etc/kubernetes/manifests/${service}.yaml"
+ fi
+
+ local container=$(docker ps | grep "k8s_${service}" | awk '{print $1}')
+ if [[ -n "${container}" ]]; then
+ log "stopping ${service} container ${container}..."
+ docker stop $container && docker rm $container
+ else
+ log "no docker container found matching k8s_${service}"
+ fi
+ }
+
+ # MAIN
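+ # poll every 10 seconds for a configmap named kube-aws-migration-disable-<hostname> in kube-system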
+
+ log "Running watcher for requests to disable core services..."
+ while true
+ do
+ log "checking disable request kube-system/${disable_confmap_name} ..."
+ request=$(get_disable_request)
+ if valid_disable_request "${request}"; then
+ log "I've received a valid disable request!"
+ disable=$(echo "${request}" | jq -erc '.data.disable')
+ for d in ${disable}; do
+ log "disabling $d..."
+ if valid_service $d; then
+ disable_service $d
+ else
+ log "ERROR: service $d is not valid - valid services are ${valid_services}"
+ fi
+ done
+ else
+ log "no request to disable services found"
+ fi
+
+ sleep 10
+ done
+
- path: /opt/bin/handle-cluster-cidr-changes
permissions: 0755
content: |
@@ -6234,104 +6376,31 @@ write_files:
rm -f ${tmpfile}
}
- # curl a controller by its healthz port (10252), if it fails then the controller isn't running.
- controller_running () {
+ # stop a controller by writing a kube-aws disable-request configmap for it
+ disable_controller () {
local controller=$1
+ local version=$2
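+ # the handle-disable-request watcher on ${controller} picks this configmap up and stops the listed services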
- curl -s --fail --connect-timeout 2 ${controller}:10252/healthz 2>&1 >/dev/null
- }
-
- # stop a controller by running a job to remove its manifests from /etc/kubernetes/manifests
- shoot_controller_in_head() {
- local controller=$1
- local return_value=0
-
- local jobspec="$(cat <<EOT
- apiVersion: batch/v1
- kind: Job
+ local request="$(cat <<EOT
+ apiVersion: v1
+ kind: ConfigMap
metadata:
- name: kill-master-${controller}
- spec:
- template:
- metadata:
- annotations:
- scheduler.alpha.kubernetes.io/critical-pod: ""
- spec:
- hostNetwork: true
- nodeSelector:
- kubernetes.io/hostname: ${controller}
- tolerations:
- # Tolerate this effect so the pods will be schedulable at all times
- - effect: NoSchedule
- operator: Exists
- - effect: NoExecute
- operator: Exists
- - key: CriticalAddonsOnly
- operator: Exists
- containers:
- - name: bang-bang-night-night
- image: ${hyperkube_image}
- command:
- - /bin/sh
- - -c
- - mv /etc/kubernetes/manifests/*.yaml /etc/kubernetes/
- volumeMounts:
- - mountPath: /etc/kubernetes
- name: etc-kubernetes
- securityContext:
- privileged: true
- restartPolicy: Never
- volumes:
- - name: etc-kubernetes
- hostPath:
- path: /etc/kubernetes
- backoffLimit: 0
+ name: kube-aws-migration-disable-${controller}
+ namespace: kube-system
+ data:
+ kubernetesVersion: ${version}
+ disable: "kube-controller-manager"
EOT
)"
+
+ log "Creating disable service configmap kube-system/kube-aws-migration-disable-${controller}"
+ echo "${request}" | kubectl -n kube-system create -f - || return 1
+ return 0
+ }
- log "Creating kubernetes job to kill the kubernetes control-plane on ${controller}"
- echo "${jobspec}" | kubectl -n kube-system create -f - || return 1
-
- local started_time=$(date +%s)
- while [ "$(date +%s)" -lt "$((started_time+job_timeout_seconds))" ]; do
- if status=$(kubectl -n kube-system get job "kill-master-${controller}" -o json | jq -r '.status'); then
- [[ "$(echo $status | jq -r '.conditions[0].type')" =~ Complete|Failed ]] && break
- fi
- log "Waiting for job to complete..."
- sleep 10
- done
-
- # Check that the job succeeded
- if [[ "$(echo $status | jq -r '.conditions[0].type')" == "Failed" ]]; then
- log "Job kill-master-${controller} failed."
- log "Failure message: $(echo $status | jq -r .conditions[0].message)"
- return_value=1
- else
- log "Job kill-master-${controller} succeeded"
- fi
-
- log "Cleaning up the job.."
- kubectl -n kube-system delete job kill-master-${controller} || return_value=1
-
- # Makes sure that the control-plane containers have stopped...
- for pod in kube-controller-manager kube-apiserver kube-scheduler; do
- if pod_exists kube-system "${pod}-${node}"; then
- log "Killing running pod ${pod}-${node}..."
- delete_pod kube-system "${pod}-${node}"
- fi
- done
-
- local started_time=$(date +%s)
- while controller_running ${controller}; do
- if [ "$(date +%s)" -gt "$((started_time+job_timeout_seconds))" ]; then
- log "Timed out waiting for controller to stop!"
- break
- fi
- log "Waiting for contoller to actually stop..."
- sleep 10
- done
-
- return $return_value
+ node_version() {
+ local node=$1
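+ # VERSION is the 5th field of the default "kubectl get node" output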
+ kubectl get node $node --no-headers --ignore-not-found | awk '{print $5}'
}
# serviceCIDRmatch - looks at a nodes labels for a service-cidr label that matches the current known servicecidr.
@@ -6389,7 +6458,7 @@ write_files:
action_stop_controller=1
fi
- [[ "${action_stop_controller}" == "1" ]] && controller_running $node && shoot_controller_in_head $node
+ [[ "${action_stop_controller}" == "1" ]] && disable_controller $node $(node_version $node)
[[ "${action_delete_node}" == "1" ]] && delete_node $node
done
}