Skip to content

Commit f2ead04

Browse files
authored
fix(shared-mounts): harden init fetch (#107)
1 parent 298cd9d commit f2ead04

6 files changed

Lines changed: 212 additions & 14 deletions

File tree

api/cmd/shared-syncer/main.go

Lines changed: 125 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"fmt"
1313
"io"
1414
"log"
15+
"net"
1516
"net/http"
1617
"net/url"
1718
"os"
@@ -37,6 +38,17 @@ const (
3738
publishSuppressAfterApply = 2 * time.Second
3839
)
3940

41+
var (
42+
initRetryWindow = 2 * time.Minute
43+
initRetryBackoff = 2 * time.Second
44+
initLatestRequestTTL = 15 * time.Second
45+
initApplyRequestTTL = 60 * time.Second
46+
sharedMountDialTimeout = 5 * time.Second
47+
sharedMountKeepAlive = 30 * time.Second
48+
sharedMountHeaderTTL = 30 * time.Second
49+
sharedMountIdleConnTTL = 90 * time.Second
50+
)
51+
4052
type sharedMountClient struct {
4153
baseURL string
4254
token string
@@ -71,7 +83,19 @@ func main() {
7183
token: token,
7284
// Long-polling calls can legitimately hold the connection open.
7385
// Prefer per-request timeouts (via context) over a tight global client timeout.
74-
client: &http.Client{Timeout: 5 * time.Minute},
86+
client: &http.Client{
87+
Timeout: 5 * time.Minute,
88+
Transport: &http.Transport{
89+
Proxy: http.ProxyFromEnvironment,
90+
DialContext: (&net.Dialer{Timeout: sharedMountDialTimeout, KeepAlive: sharedMountKeepAlive}).DialContext,
91+
ForceAttemptHTTP2: true,
92+
MaxIdleConns: 100,
93+
IdleConnTimeout: sharedMountIdleConnTTL,
94+
TLSHandshakeTimeout: sharedMountDialTimeout,
95+
ExpectContinueTimeout: 1 * time.Second,
96+
ResponseHeaderTimeout: sharedMountHeaderTTL,
97+
},
98+
},
7599
}
76100

77101
state := make([]*sharedMountState, 0, len(mounts))
@@ -134,23 +158,99 @@ func runInit(ctx context.Context, logger *log.Logger, client *sharedMountClient,
134158
if err := ensureMountPath(state.spec.MountPath); err != nil {
135159
return err
136160
}
137-
manifest, found, err := client.latest(ctx, ownerID, state.spec.Name)
138-
if err != nil {
161+
if err := runInitMount(ctx, logger, client, ownerID, state); err != nil {
139162
return err
140163
}
141-
if !found {
142-
continue
164+
}
165+
logger.Print("init complete")
166+
return nil
167+
}
168+
169+
func runInitMount(ctx context.Context, logger *log.Logger, client *sharedMountClient, ownerID string, state *sharedMountState) error {
170+
deadline := time.Now().Add(initRetryWindow)
171+
attempt := 0
172+
for {
173+
attempt++
174+
err := runInitMountAttempt(ctx, client, ownerID, state)
175+
if err == nil {
176+
return nil
143177
}
144-
if err := applyRevision(ctx, client, ownerID, state.spec, manifest.Revision); err != nil {
178+
if !isRetryableInitError(err) || time.Now().After(deadline) {
145179
return err
146180
}
147-
state.currentRevision = manifest.Revision
148-
state.currentChecksum = manifest.Checksum
181+
logger.Printf("init retry for %s attempt=%d after error: %v", state.spec.Name, attempt, err)
182+
select {
183+
case <-ctx.Done():
184+
return ctx.Err()
185+
case <-time.After(initRetryBackoff):
186+
}
149187
}
150-
logger.Print("init complete")
188+
}
189+
190+
func runInitMountAttempt(ctx context.Context, client *sharedMountClient, ownerID string, state *sharedMountState) error {
191+
latestCtx, cancelLatest := context.WithTimeout(ctx, initLatestRequestTTL)
192+
defer cancelLatest()
193+
194+
manifest, found, err := client.latest(latestCtx, ownerID, state.spec.Name)
195+
if err != nil {
196+
return err
197+
}
198+
if !found {
199+
return nil
200+
}
201+
202+
applyCtx, cancelApply := context.WithTimeout(ctx, initApplyRequestTTL)
203+
defer cancelApply()
204+
205+
if err := applyRevision(applyCtx, client, ownerID, state.spec, manifest.Revision); err != nil {
206+
return err
207+
}
208+
state.currentRevision = manifest.Revision
209+
state.currentChecksum = manifest.Checksum
151210
return nil
152211
}
153212

213+
type remoteHTTPError struct {
214+
StatusCode int
215+
Message string
216+
}
217+
218+
func (e *remoteHTTPError) Error() string {
219+
return e.Message
220+
}
221+
222+
func isRetryableInitError(err error) bool {
223+
if err == nil {
224+
return false
225+
}
226+
if errors.Is(err, context.DeadlineExceeded) {
227+
return true
228+
}
229+
var netErr net.Error
230+
if errors.As(err, &netErr) && netErr.Timeout() {
231+
return true
232+
}
233+
var urlErr *url.Error
234+
if errors.As(err, &urlErr) {
235+
if errors.Is(urlErr.Err, context.DeadlineExceeded) {
236+
return true
237+
}
238+
if errors.As(urlErr.Err, &netErr) && netErr.Timeout() {
239+
return true
240+
}
241+
}
242+
var httpErr *remoteHTTPError
243+
if errors.As(err, &httpErr) {
244+
return httpErr.StatusCode == http.StatusTooManyRequests || httpErr.StatusCode >= http.StatusInternalServerError
245+
}
246+
message := strings.ToLower(err.Error())
247+
return strings.Contains(message, "i/o timeout") ||
248+
strings.Contains(message, "connection reset by peer") ||
249+
strings.Contains(message, "connection refused") ||
250+
strings.Contains(message, "no route to host") ||
251+
strings.Contains(message, "unexpected eof")
252+
}
253+
154254
func runSidecar(ctx context.Context, logger *log.Logger, client *sharedMountClient, ownerID string, mounts []*sharedMountState) {
155255
for _, state := range mounts {
156256
state := state
@@ -876,7 +976,10 @@ func (c *sharedMountClient) latest(ctx context.Context, ownerID, mount string) (
876976
}
877977
if resp.StatusCode != http.StatusOK {
878978
body, _ := io.ReadAll(resp.Body)
879-
return sharedmounts.LatestManifest{}, false, fmt.Errorf("latest fetch failed: %s", strings.TrimSpace(string(body)))
979+
return sharedmounts.LatestManifest{}, false, &remoteHTTPError{
980+
StatusCode: resp.StatusCode,
981+
Message: fmt.Sprintf("latest fetch failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(body))),
982+
}
880983
}
881984
body, err := io.ReadAll(resp.Body)
882985
if err != nil {
@@ -985,7 +1088,10 @@ func (c *sharedMountClient) downloadRevision(ctx context.Context, ownerID, mount
9851088
defer resp.Body.Close()
9861089
if resp.StatusCode != http.StatusOK {
9871090
body, _ := io.ReadAll(resp.Body)
988-
return fmt.Errorf("revision fetch failed: %s", strings.TrimSpace(string(body)))
1091+
return &remoteHTTPError{
1092+
StatusCode: resp.StatusCode,
1093+
Message: fmt.Sprintf("revision fetch failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(body))),
1094+
}
9891095
}
9901096
_, err = io.Copy(dest, resp.Body)
9911097
return err
@@ -1016,7 +1122,10 @@ func (c *sharedMountClient) uploadRevision(ctx context.Context, ownerID, mount,
10161122
defer resp.Body.Close()
10171123
if resp.StatusCode != http.StatusOK {
10181124
body, _ := io.ReadAll(resp.Body)
1019-
return fmt.Errorf("revision upload failed: %s", strings.TrimSpace(string(body)))
1125+
return &remoteHTTPError{
1126+
StatusCode: resp.StatusCode,
1127+
Message: fmt.Sprintf("revision upload failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(body))),
1128+
}
10201129
}
10211130
return nil
10221131
}
@@ -1046,7 +1155,10 @@ func (c *sharedMountClient) updateLatest(ctx context.Context, ownerID, mount str
10461155
}
10471156
if resp.StatusCode != http.StatusOK {
10481157
body, _ := io.ReadAll(resp.Body)
1049-
return fmt.Errorf("latest update failed: %s", strings.TrimSpace(string(body)))
1158+
return &remoteHTTPError{
1159+
StatusCode: resp.StatusCode,
1160+
Message: fmt.Sprintf("latest update failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(body))),
1161+
}
10501162
}
10511163
return nil
10521164
}

api/cmd/shared-syncer/main_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"context"
88
"encoding/json"
99
"io"
10+
"log"
1011
"net/http"
1112
"net/http/httptest"
1213
"os"
@@ -15,6 +16,8 @@ import (
1516
"strings"
1617
"testing"
1718
"time"
19+
20+
"spritz.sh/operator/sharedmounts"
1821
)
1922

2023
func TestUploadRevisionSetsContentLength(t *testing.T) {
@@ -154,6 +157,59 @@ func TestLatestRejectsInvalidPayload(t *testing.T) {
154157
}
155158
}
156159

160+
func TestRunInitRetriesTransientLatestTimeout(t *testing.T) {
161+
originalRetryWindow := initRetryWindow
162+
originalRetryBackoff := initRetryBackoff
163+
originalLatestTTL := initLatestRequestTTL
164+
originalApplyTTL := initApplyRequestTTL
165+
t.Cleanup(func() {
166+
initRetryWindow = originalRetryWindow
167+
initRetryBackoff = originalRetryBackoff
168+
initLatestRequestTTL = originalLatestTTL
169+
initApplyRequestTTL = originalApplyTTL
170+
})
171+
172+
initRetryWindow = 200 * time.Millisecond
173+
initRetryBackoff = 5 * time.Millisecond
174+
initLatestRequestTTL = 20 * time.Millisecond
175+
initApplyRequestTTL = 50 * time.Millisecond
176+
177+
attempts := 0
178+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
179+
if !strings.HasSuffix(r.URL.Path, "/latest") {
180+
t.Fatalf("unexpected path: %s", r.URL.Path)
181+
}
182+
attempts++
183+
if attempts == 1 {
184+
time.Sleep(60 * time.Millisecond)
185+
return
186+
}
187+
w.WriteHeader(http.StatusNotFound)
188+
}))
189+
defer srv.Close()
190+
191+
client := &sharedMountClient{
192+
baseURL: srv.URL,
193+
token: "token",
194+
client: srv.Client(),
195+
}
196+
197+
state := []*sharedMountState{{
198+
spec: sharedmounts.MountSpec{
199+
Name: "config",
200+
Scope: sharedmounts.ScopeOwner,
201+
MountPath: t.TempDir(),
202+
},
203+
}}
204+
205+
if err := runInit(context.Background(), log.New(io.Discard, "", 0), client, "owner", state); err != nil {
206+
t.Fatalf("runInit failed: %v", err)
207+
}
208+
if attempts < 2 {
209+
t.Fatalf("expected init retry after transient timeout, got %d attempt(s)", attempts)
210+
}
211+
}
212+
157213
func TestEnsureEmptyLiveCreatesWritableCurrent(t *testing.T) {
158214
mountPath := filepath.Join(t.TempDir(), "mount")
159215
if err := ensureMountPath(mountPath); err != nil {

helm/spritz/templates/api-deployment.yaml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: spritz-api
99
namespace: {{ .Values.api.namespace }}
1010
spec:
11-
replicas: 1
11+
replicas: {{ .Values.api.replicaCount }}
1212
selector:
1313
matchLabels:
1414
app.kubernetes.io/name: spritz-api
@@ -27,6 +27,14 @@ spec:
2727
{{- end }}
2828
spec:
2929
serviceAccountName: {{ .Values.api.serviceAccountName }}
30+
{{- with .Values.api.affinity }}
31+
affinity:
32+
{{- toYaml . | nindent 8 }}
33+
{{- end }}
34+
{{- with .Values.api.topologySpreadConstraints }}
35+
topologySpreadConstraints:
36+
{{- toYaml . | nindent 8 }}
37+
{{- end }}
3038
containers:
3139
- name: api
3240
image: {{ .Values.api.image }}

helm/spritz/templates/api-pdb.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{{- if and .Values.api.podDisruptionBudget.enabled (gt (int .Values.api.replicaCount) 1) }}
2+
apiVersion: policy/v1
3+
kind: PodDisruptionBudget
4+
metadata:
5+
name: spritz-api
6+
namespace: {{ .Values.api.namespace }}
7+
spec:
8+
minAvailable: {{ .Values.api.podDisruptionBudget.minAvailable }}
9+
selector:
10+
matchLabels:
11+
app.kubernetes.io/name: spritz-api
12+
{{- end }}

helm/spritz/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,13 +116,19 @@ operator:
116116
- ALL
117117

118118
api:
119+
replicaCount: 1
119120
image: spritz-api:latest
120121
imagePullPolicy: IfNotPresent
121122
rolloutAt: ""
122123
namespace: spritz-system
123124
serviceAccountName: spritz-api
124125
defaultAnnotations: ""
125126
podAnnotations: {}
127+
affinity: {}
128+
topologySpreadConstraints: []
129+
podDisruptionBudget:
130+
enabled: false
131+
minAvailable: 1
126132
service:
127133
port: 8080
128134
auth:

scripts/verify-helm.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,14 @@ default_render="${tmp_dir}/default.yaml"
6464
auth_render="${tmp_dir}/auth.yaml"
6565
auth_annotations_render="${tmp_dir}/auth-annotations.yaml"
6666
acp_network_policy_render="${tmp_dir}/acp-network-policy.yaml"
67+
api_ha_render="${tmp_dir}/api-ha.yaml"
6768

6869
helm lint "${chart_dir}"
6970
helm template spritz "${chart_dir}" >"${default_render}"
7071
helm template spritz "${chart_dir}" -f "${example_values}" >"${auth_render}"
7172
helm template spritz "${chart_dir}" -f "${example_values}" --set authGateway.ingress.annotations.authonly=enabled >"${auth_annotations_render}"
7273
helm template spritz "${chart_dir}" --set acp.networkPolicy.enabled=true >"${acp_network_policy_render}"
74+
helm template spritz "${chart_dir}" --set api.replicaCount=2 --set api.podDisruptionBudget.enabled=true >"${api_ha_render}"
7375

7476
expect_contains "${default_render}" "name: spritz-web" "spritz-web ingress in default render"
7577
expect_not_contains "${default_render}" "name: spritz-auth" "spritz-auth ingress when auth gateway is disabled"
@@ -82,6 +84,8 @@ expect_contains "${auth_render}" "nginx.ingress.kubernetes.io/configuration-snip
8284
expect_contains "${auth_annotations_render}" "authonly: enabled" "auth ingress custom annotations in auth render"
8385
expect_contains "${acp_network_policy_render}" "kind: NetworkPolicy" "ACP network policy when enabled"
8486
expect_contains "${acp_network_policy_render}" "name: spritz-acp" "ACP network policy name when enabled"
87+
expect_contains "${api_ha_render}" "kind: PodDisruptionBudget" "API pod disruption budget when enabled"
88+
expect_contains "${api_ha_render}" "name: spritz-api" "API pod disruption budget name when enabled"
8589
expect_contains "${default_render}" 'resources: ["spritzes/status", "spritzconversations/status"]' "status RBAC for spritz conversations"
8690
expect_contains "${default_render}" "name: SPRITZ_AUTH_HEADER_TYPE" "principal type auth header wiring"
8791
expect_contains "${default_render}" "name: SPRITZ_AUTH_BEARER_SCOPES_PATHS" "bearer scope path wiring"

0 commit comments

Comments
 (0)