
Commit 5147dd1

Add a new service that allows kube-aws upgrades to disable core services (kube-apiserver, kube-controller-manager, and kube-scheduler) on existing/legacy controllers by sending them a request to do so via a special configmap. (#1639)
Update the handle-cluster-cidr-changes script to use this functionality instead of its own custom job, which was unreliable.
1 parent 7ed79b3 commit 5147dd1
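
The disable request described in the commit message is just a ConfigMap in the kube-system namespace, named after the target controller's hostname, as documented in the comments of the new /opt/bin/handle-disable-request script below. A minimal sketch of such a request (the hostname and version here are placeholder values) asking a controller running v1.9.3 to stop all three core services:

apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-aws-migration-disable-ip-10-0-0-1.us-west-2.compute.internal
  namespace: kube-system
data:
  kubernetesVersion: v1.9.3
  disable: "kube-apiserver kube-controller-manager kube-scheduler"

The handle-disable-request watcher on each controller only acts on a ConfigMap whose name matches its own hostname and whose kubernetesVersion matches its own version; the updated handle-cluster-cidr-changes script creates a ConfigMap of this shape (with disable: "kube-controller-manager") instead of scheduling its old kill-master job.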

1 file changed: core/controlplane/config/templates/cloud-config-controller (+162 -93 lines)
@@ -54,6 +54,26 @@ coreos:
         ExecStart=/usr/bin/sh -c 'for u in update-engine locksmithd; do systemctl stop $${u}.service; systemctl mask $${u}.service; systemctl disable $${u}.service; done; systemctl reset-failed'
 {{end}}
 
+    - name: handle-disable-request.service
+      enable: true
+      command: start
+      content: |
+        [Unit]
+        Description=Shuts down core services when requested
+        After=kubelet.service network-online.target
+        Wants=kubelet.service
+
+        [Service]
+        Type=simple
+        TimeoutStartSec=60m
+        Restart=on-failure
+        RestartSec=30
+        ExecStartPre=/usr/bin/systemctl is-active kubelet
+        ExecStart=/opt/bin/handle-disable-request
+
+        [Install]
+        WantedBy=multi-user.target
+
     - name: handle-cluster-cidr-changes.service
       enable: true
       command: start
@@ -5916,6 +5936,128 @@ write_files:
         fi
       done
 
+  - path: /opt/bin/handle-disable-request
+    permissions: 0755
+    content: |
+      #!/bin/bash
+      # Allows a controller to disable its core services upon request
+      # Created to allow more ambitious kubernetes upgrades
+      # and changes to cluster settings such as service_cidr or pod_cidr
+      #
+      # A request to disable is a configmap matching the hostname and kubernetes version, containing a list of core services to stop:
+      # apiVersion: v1
+      # kind: ConfigMap
+      # metadata:
+      #   name: kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal
+      #   namespace: kube-system
+      # data:
+      #   kubernetesVersion: v1.9.3
+      #   disable: "kube-apiserver kube-controller-manager kube-scheduler"
+
+      retries=5
+      hyperkube_image="{{.HyperkubeImage.RepoWithTag}}"
+      my_kubernetes_version="{{.HyperkubeImage.Tag}}"
+      myhostname=$(hostname -f)
+      disable_confmap_name="kube-aws-migration-disable-${myhostname}"
+      valid_services="kube-apiserver kube-controller-manager kube-scheduler"
+
+      kubectl() {
+        local tries=0
+        local result_text=""
+        local return_code=0
+
+        while [ "$tries" -lt "$retries" ]; do
+          result_text=$(docker run --rm -i --net=host -v /tmp:/tmp:rw -v /etc/kubernetes:/etc/kubernetes:ro -v /etc/resolv.conf:/etc/resolv.conf:ro $hyperkube_image /kubectl "$@")
+          return_code=$?
+          if [ "$return_code" -eq "0" ]; then
+            echo "${result_text}"
+            break
+          fi
+          sleep 10
+          tries=$((tries+1))
+        done
+        return $return_code
+      }
+
+      log() {
+        echo "$@" >&2
+      }
+
+      get_disable_request() {
+        kubectl get cm -n kube-system $disable_confmap_name -o json --ignore-not-found
+      }
+
+      valid_disable_request() {
+        local disable_payload=$1
+
+        if [[ -n "${disable_payload}" ]]; then
+          log "found a disable request"
+          local kubernetes_version=$(echo ${disable_payload} | jq -er '.data.kubernetesVersion')
+          if [[ "${kubernetes_version}" == "${my_kubernetes_version}" ]]; then
+            log "valid request: kubernetes version match: ${kubernetes_version}"
+            return 0
+          else
+            log "invalid request: kubernetes version ${kubernetes_version} does not match my version ${my_kubernetes_version}"
+            return 1
+          fi
+        fi
+        log "no disable request found"
+        return 1
+      }
+
+      valid_service() {
+        for s in $valid_services; do
+          if [[ "$s" == $1 ]]; then
+            return 0
+          fi
+        done
+        return 1
+      }
+
+      disable_service() {
+        local service=$1
+
+        if [[ -f "/etc/kubernetes/manifests/${service}.yaml" ]]; then
+          log "Moving manifest /etc/kubernetes/manifests/${service}.yaml to /etc/kubernetes/${service}.yaml"
+          mv /etc/kubernetes/manifests/${service}.yaml /etc/kubernetes/${service}.yaml
+        else
+          log "No manifest found when looking for /etc/kubernetes/manifests/${service}.yaml"
+        fi
+
+        local container=$(docker ps | grep "k8s_${service}" | awk '{print $1}')
+        if [[ -n "${container}" ]]; then
+          log "stopping ${service} container ${container}..."
+          docker stop $container && docker rm $container
+        else
+          log "no docker container found matching k8s_${service}"
+        fi
+      }
+
+      # MAIN
+
+      log "Running watcher for requests to disable core services..."
+      while true
+      do
+        log "checking disable request kube-system/${disable_confmap_name} ..."
+        request=$(get_disable_request)
+        if valid_disable_request "${request}"; then
+          log "I've received a valid disable request!"
+          disable=$(echo "${request}" | jq -erc '.data.disable')
+          for d in ${disable}; do
+            log "disabling $d..."
+            if valid_service $d; then
+              disable_service $d
+            else
+              log "ERROR: service $d is not valid - valid services are ${valid_services}"
+            fi
+          done
+        else
+          log "no request to disable services found"
+        fi
+
+        sleep 10
+      done
+
   - path: /opt/bin/handle-cluster-cidr-changes
     permissions: 0755
     content: |
@@ -6234,104 +6376,31 @@ write_files:
         rm -f ${tmpfile}
       }
 
-      # curl a controller by its healthz port (10252), if it fails then the controller isn't running.
-      controller_running() {
+      # stop a controller by writing a special kube-aws disable service configmap
+      disable_controller() {
         local controller=$1
+        local version=$2
 
-        curl -s --fail --connect-timeout 2 ${controller}:10252/healthz 2>&1 >/dev/null
-      }
-
-      # stop a controller by running a job to remove its manifests from /etc/kubernetes/manifests
-      shoot_controller_in_head() {
-        local controller=$1
-        local return_value=0
-
-        local jobspec="$(cat <<EOT
-      apiVersion: batch/v1
-      kind: Job
+        local request="$(cat <<EOT
+      apiVersion: v1
+      kind: ConfigMap
       metadata:
-        name: kill-master-${controller}
-      spec:
-        template:
-          metadata:
-            annotations:
-              scheduler.alpha.kubernetes.io/critical-pod: ""
-          spec:
-            hostNetwork: true
-            nodeSelector:
-              kubernetes.io/hostname: ${controller}
-            tolerations:
-            # Tolerate this effect so the pods will be schedulable at all times
-            - effect: NoSchedule
-              operator: Exists
-            - effect: NoExecute
-              operator: Exists
-            - key: CriticalAddonsOnly
-              operator: Exists
-            containers:
-            - name: bang-bang-night-night
-              image: ${hyperkube_image}
-              command:
-              - /bin/sh
-              - -c
-              - mv /etc/kubernetes/manifests/*.yaml /etc/kubernetes/
-              volumeMounts:
-              - mountPath: /etc/kubernetes
-                name: etc-kubernetes
-              securityContext:
-                privileged: true
-            restartPolicy: Never
-            volumes:
-            - name: etc-kubernetes
-              hostPath:
-                path: /etc/kubernetes
-        backoffLimit: 0
+        name: kube-aws-migration-disable-${controller}
+        namespace: kube-system
+      data:
+        kubernetesVersion: ${version}
+        disable: "kube-controller-manager"
       EOT
       )"
+
+        log "Creating disable service configmap kube-system/kube-aws-migration-disable-${controller}"
+        echo "${request}" | kubectl -n kube-system create -f - || return 1
+        return 0
+      }
 
-        log "Creating kubernetes job to kill the kubernetes control-plane on ${controller}"
-        echo "${jobspec}" | kubectl -n kube-system create -f - || return 1
-
-        local started_time=$(date +%s)
-        while [ "$(date +%s)" -lt "$((started_time+job_timeout_seconds))" ]; do
-          if status=$(kubectl -n kube-system get job "kill-master-${controller}" -o json | jq -r '.status'); then
-            [[ "$(echo $status | jq -r '.conditions[0].type')" =~ Complete|Failed ]] && break
-          fi
-          log "Waiting for job to complete..."
-          sleep 10
-        done
-
-        # Check that the job succeeded
-        if [[ "$(echo $status | jq -r '.conditions[0].type')" == "Failed" ]]; then
-          log "Job kill-master-${controller} failed."
-          log "Failure message: $(echo $status | jq -r .conditions[0].message)"
-          return_value=1
-        else
-          log "Job kill-master-${controller} succeeded"
-        fi
-
-        log "Cleaning up the job.."
-        kubectl -n kube-system delete job kill-master-${controller} || return_value=1
-
-        # Makes sure that the control-plane containers have stopped...
-        for pod in kube-controller-manager kube-apiserver kube-scheduler; do
-          if pod_exists kube-system "${pod}-${node}"; then
-            log "Killing running pod ${pod}-${node}..."
-            delete_pod kube-system "${pod}-${node}"
-          fi
-        done
-
-        local started_time=$(date +%s)
-        while controller_running ${controller}; do
-          if [ "$(date +%s)" -gt "$((started_time+job_timeout_seconds))" ]; then
-            log "Timed out waiting for controller to stop!"
-            break
-          fi
-          log "Waiting for contoller to actually stop..."
-          sleep 10
-        done
-
-        return $return_value
+      node_version() {
+        local node=$1
+        kubectl get node $node --no-headers --ignore-not-found | awk '{print $5}'
       }
 
       # serviceCIDRmatch - looks at a nodes labels for a service-cidr label that matches the current known servicecidr.
@@ -6389,7 +6458,7 @@ write_files:
             action_stop_controller=1
           fi
 
-          [[ "${action_stop_controller}" == "1" ]] && controller_running $node && shoot_controller_in_head $node
+          [[ "${action_stop_controller}" == "1" ]] && disable_controller $node $(node_version $node)
           [[ "${action_delete_node}" == "1" ]] && delete_node $node
         done
       }
