Skip to content

Commit 25dcc4d

Browse files
kriscoleman and claude authored
fix(kotsadm): add wait-for-rqlite init container before schemahero migrations (#5925)
* fix(kotsadm): add wait-for-rqlite init container before schemahero

  When kotsadm and rqlite restart simultaneously (e.g., during EC upgrades), schemahero-plan runs before rqlite accepts connections, causing CrashLoopBackOff with "tried all peers unsuccessfully".

  Insert a wait-for-rqlite init container at position 0 in both the Deployment and StatefulSet that polls http://kotsadm-rqlite:4001/readyz until rqlite reports ready. This prevents the race between schemahero and rqlite startup.

  Affects both KotsadmDeployment() and KotsadmStatefulSet() init container lists.

  Closes replicated-collab/netbox-replicated#149
  Ref: replicated-collab/netbox-replicated#148

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(kotsadm): add 5-min timeout, use lighter image, add unit tests

  Address review findings:
  - Add 5-minute timeout to wait loop so rqlite failures surface as a clear init error rather than an indefinite hang
  - Use kotsadm-migrations image (lighter, already pulled for schemahero)
  - Replace private repo link with Shortcut story reference
  - Add unit tests for waitForRqliteInitContainer() and verify init container ordering in KotsadmDeployment()

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* refactor(kotsadm): extract wait-for-rqlite script to embedded .sh file

  Per review feedback: move the inline shell script out of the Go source and load it via go:embed, matching the existing pattern used by minio_objects.go for scripts/{copy,export,import}-minio-data.sh.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8de414a commit 25dcc4d

3 files changed

Lines changed: 99 additions & 0 deletions

File tree

pkg/kotsadm/objects/kotsadm_objects.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package kotsadm
22

33
import (
4+
_ "embed"
45
"fmt"
56
"os"
7+
"strings"
68

79
"github.com/pkg/errors"
810
"github.com/replicatedhq/kots/pkg/ingress"
@@ -20,6 +22,9 @@ import (
2022
"k8s.io/utils/pointer"
2123
)
2224

25+
// waitForRqliteScript holds the contents of scripts/wait-for-rqlite.sh, embedded
// into the binary at build time. waitForRqliteInitContainer passes it, with
// surrounding whitespace trimmed, as the single argument to `sh -c`.
//go:embed scripts/wait-for-rqlite.sh
26+
var waitForRqliteScript string
27+
2328
func KotsadmClusterRole() *rbacv1.ClusterRole {
2429
clusterRole := &rbacv1.ClusterRole{
2530
TypeMeta: metav1.TypeMeta{
@@ -195,6 +200,32 @@ func updateKotsadmDeploymentScriptsPath(existing *appsv1.Deployment) {
195200
}
196201
}
197202

203+
// waitForRqliteInitContainer returns an init container that polls the rqlite
204+
// readiness endpoint before schemahero-plan runs. This prevents CrashLoopBackOff
205+
// when kotsadm and rqlite restart simultaneously (e.g., during EC upgrades).
206+
// Times out after 5 minutes so rqlite failures surface as a clear init error
207+
// rather than an indefinite hang.
208+
// Ref: https://app.shortcut.com/replicated/story/138103
209+
func waitForRqliteInitContainer(deployOptions types.DeployOptions) corev1.Container {
210+
return corev1.Container{
211+
Image: GetAdminConsoleImage(deployOptions, "kotsadm-migrations"),
212+
ImagePullPolicy: corev1.PullIfNotPresent,
213+
Name: "wait-for-rqlite",
214+
Command: []string{"sh", "-c"},
215+
Args: []string{strings.TrimSpace(waitForRqliteScript)},
216+
Resources: corev1.ResourceRequirements{
217+
Limits: corev1.ResourceList{
218+
"memory": resource.MustParse("50Mi"),
219+
},
220+
Requests: corev1.ResourceList{
221+
"cpu": resource.MustParse("10m"),
222+
"memory": resource.MustParse("10Mi"),
223+
},
224+
},
225+
SecurityContext: k8sutil.SecureContainerContext(deployOptions.StrictSecurityContext),
226+
}
227+
}
228+
198229
func KotsadmDeployment(deployOptions types.DeployOptions) (*appsv1.Deployment, error) {
199230
securityContext := k8sutil.SecurePodContext(1001, 1001, deployOptions.StrictSecurityContext)
200231
if deployOptions.IsOpenShift {
@@ -493,6 +524,7 @@ func KotsadmDeployment(deployOptions types.DeployOptions) (*appsv1.Deployment, e
493524
RestartPolicy: corev1.RestartPolicyAlways,
494525
ImagePullSecrets: pullSecrets,
495526
InitContainers: []corev1.Container{
527+
waitForRqliteInitContainer(deployOptions),
496528
{
497529
Image: GetAdminConsoleImage(deployOptions, "kotsadm-migrations"),
498530
ImagePullPolicy: corev1.PullIfNotPresent,
@@ -1086,6 +1118,7 @@ func KotsadmStatefulSet(deployOptions types.DeployOptions, size resource.Quantit
10861118
RestartPolicy: corev1.RestartPolicyAlways,
10871119
ImagePullSecrets: pullSecrets,
10881120
InitContainers: []corev1.Container{
1121+
waitForRqliteInitContainer(deployOptions),
10891122
{
10901123
Image: GetAdminConsoleImage(deployOptions, "kotsadm-migrations"),
10911124
ImagePullPolicy: corev1.PullIfNotPresent,

pkg/kotsadm/objects/kotsadm_objects_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
package kotsadm
22

33
import (
4+
"strings"
45
"testing"
56

7+
"github.com/replicatedhq/kots/pkg/kotsadm/types"
68
"github.com/stretchr/testify/assert"
9+
"github.com/stretchr/testify/require"
710
appsv1 "k8s.io/api/apps/v1"
811
corev1 "k8s.io/api/core/v1"
912
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -226,3 +229,44 @@ func Test_updateKotsadmDeploymentScriptsPath(t *testing.T) {
226229
})
227230
}
228231
}
232+
233+
func Test_waitForRqliteInitContainer(t *testing.T) {
234+
opts := types.DeployOptions{
235+
Namespace: "default",
236+
StrictSecurityContext: true,
237+
}
238+
c := waitForRqliteInitContainer(opts)
239+
240+
assert.Equal(t, "wait-for-rqlite", c.Name)
241+
assert.Equal(t, corev1.PullIfNotPresent, c.ImagePullPolicy)
242+
assert.Equal(t, []string{"sh", "-c"}, c.Command)
243+
require.Len(t, c.Args, 1)
244+
245+
// Polls /readyz
246+
assert.Contains(t, c.Args[0], "kotsadm-rqlite:4001/readyz")
247+
// Has a timeout (not an infinite loop)
248+
assert.Contains(t, c.Args[0], "timeout=300")
249+
// Exits non-zero on timeout
250+
assert.True(t, strings.HasSuffix(c.Args[0], "exit 1"))
251+
252+
// Resource requests are set
253+
assert.NotNil(t, c.Resources.Requests.Cpu())
254+
assert.NotNil(t, c.Resources.Requests.Memory())
255+
assert.NotNil(t, c.Resources.Limits.Memory())
256+
257+
// Security context is set
258+
assert.NotNil(t, c.SecurityContext)
259+
}
260+
261+
func Test_kotsadmDeploymentHasWaitForRqlite(t *testing.T) {
262+
opts := types.DeployOptions{
263+
Namespace: "default",
264+
}
265+
dep, err := KotsadmDeployment(opts)
266+
require.NoError(t, err)
267+
268+
initContainers := dep.Spec.Template.Spec.InitContainers
269+
require.True(t, len(initContainers) >= 4, "expected at least 4 init containers, got %d", len(initContainers))
270+
assert.Equal(t, "wait-for-rqlite", initContainers[0].Name, "wait-for-rqlite should be the first init container")
271+
assert.Equal(t, "schemahero-plan", initContainers[1].Name, "schemahero-plan should follow wait-for-rqlite")
272+
}
pkg/kotsadm/objects/scripts/wait-for-rqlite.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/sh
# Polls the rqlite readiness endpoint before schemahero-plan runs.
# Prevents CrashLoopBackOff when kotsadm and rqlite restart simultaneously
# (e.g., during Embedded Cluster upgrades).
# Times out after 5 minutes so rqlite failures surface as a clear init error
# rather than an indefinite hang.
#
# Elapsed time is measured against the wall clock, and each request carries its
# own timeout, so a stalled connection counts toward the 5-minute budget
# instead of silently extending it.

timeout=300
request_timeout=5
interval=2

start=$(date +%s)
elapsed=0

while [ "$elapsed" -lt "$timeout" ]; do
  # -T bounds a single request; wget's default timeout can be much longer than
  # the overall budget.
  if wget -q -T "$request_timeout" -O- http://kotsadm-rqlite:4001/readyz 2>/dev/null | grep -q "ok"; then
    echo "rqlite is ready (${elapsed}s)"
    exit 0
  fi
  echo "Waiting for rqlite... (${elapsed}s/${timeout}s)"
  sleep "$interval"
  # Recompute from the clock so time spent inside wget is accounted for.
  elapsed=$(( $(date +%s) - $start ))
done

echo "ERROR: rqlite not ready after ${timeout}s"
exit 1

0 commit comments

Comments
 (0)