Skip to content

Commit 792b4ab

Browse files
authored
chart: Fix controller deployment liveness probe (#186)
The node plugin is a daemonset living on the host's network (loopback), so kubelet can reach it via localhost. The controller lives inside the Pod's network namespace, so kubelet is not able to reach it on localhost. Therefore, remove `localhost` from the controller's liveness probe, so that kubelet instead uses the advertised Pod IP address. Also, expose the health endpoints on all addresses. This ensures that the health endpoints are reachable on every address the Pod is exposed on. Additionally, add a check that confirms all containers report ready status and 0 restarts before each test. The no-restarts check might be too strict, but let's start with that, and if we detect it is causing issues, we can relax it later.
2 parents 50213db + f8e6466 commit 792b4ab

3 files changed

Lines changed: 28 additions & 2 deletions

File tree

charts/templates/lxd-csi-controller-deployment.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ spec:
145145
readOnly: true
146146
livenessProbe:
147147
httpGet:
148-
host: localhost
149148
path: /healthz
150149
port: 39008
151150
failureThreshold: 3
@@ -249,7 +248,7 @@ spec:
249248
args:
250249
- --v=2
251250
- --csi-address=$(CSI_ADDRESS)
252-
- --http-endpoint=127.0.0.1:39008
251+
- --http-endpoint=:39008
253252
- --probe-timeout=3s
254253
env:
255254
- name: CSI_ADDRESS

test/e2e/e2e_debug.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,35 @@ import (
99
"strings"
1010
"time"
1111

12+
"github.com/onsi/gomega"
1213
corev1 "k8s.io/api/core/v1"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1415
"k8s.io/apimachinery/pkg/labels"
1516
"k8s.io/client-go/kubernetes"
1617
)
1718

19+
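// waitContainersReady waits (up to 30 seconds) until the given namespace contains at least one pod
// and every container of every pod in it is ready and has a restart count of zero.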
20+
func waitContainersReady(ctx context.Context, client *kubernetes.Clientset, namespace string) {
21+
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
22+
defer cancel()
23+
24+
waitReady := func(g gomega.Gomega) {
25+
pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
26+
g.Expect(err).NotTo(gomega.HaveOccurred(), "Failed to list pods in namespace %q", namespace)
27+
g.Expect(pods.Items).NotTo(gomega.BeEmpty(), "No pods found in namespace %q", namespace)
28+
29+
for _, pod := range pods.Items {
30+
for _, cs := range pod.Status.ContainerStatuses {
31+
name := pod.Name + "/" + cs.Name
32+
g.Expect(cs.RestartCount).To(gomega.BeZero(), "Container %q has restarted", name)
33+
g.Expect(cs.Ready).To(gomega.BeTrue(), "Container %q is not ready", name)
34+
}
35+
}
36+
}
37+
38+
gomega.Eventually(waitReady).WithContext(ctx).Should(gomega.Succeed())
39+
}
40+
1841
func printControllerLogs(ctx context.Context, client *kubernetes.Clientset, namespace string, name string, since time.Time) {
1942
fmt.Printf("\n=== Controller logs ===\n")
2043

test/e2e/e2e_suite_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,10 @@ func getTestLXDStoragePool(driver string) (poolName string, cleanup func()) {
150150
return poolName, cleanup
151151
}
152152

153+
var _ = ginkgo.BeforeEach(func(ctx ginkgo.SpecContext) {
154+
waitContainersReady(ctx, testutils.GetKubernetesClient(testutils.GetClientConfig()), "lxd-csi")
155+
})
156+
153157
var _ = ginkgo.AfterEach(func() {
154158
// Provide useful information when test fails.
155159
rep := ginkgo.CurrentSpecReport()

0 commit comments

Comments
 (0)