Skip to content

Commit c03fed0

Browse files
committed
Add retry logic for deploy commands to handle transient network errors
1 parent 54a1c93 commit c03fed0

File tree

1 file changed

+65
-5
lines changed

1 file changed

+65
-5
lines changed

test/preflight/fly_deploy_test.go

Lines changed: 65 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ func TestFlyDeployHAPlacement(t *testing.T) {
9494
// The backend may not have replicated the app record to all hosts yet when
9595
// creating the second machine for HA, resulting in "sql: no rows in result set" errors
9696
var lastError string
97-
require.EventuallyWithT(f, func(c *assert.CollectT) {
97+
require.EventuallyWithT(t, func(c *assert.CollectT) {
9898
result := f.FlyAllowExitFailure("deploy --buildkit --remote-only")
9999
if result.ExitCode() != 0 {
100100
stderr := result.StdErrString()
@@ -241,10 +241,40 @@ func testDeployNodeAppWithRemoteBuilder(tt *testing.T) {
241241
require.NoError(t, err)
242242

243243
t.Logf("deploy %s", appName)
244-
f.Fly("deploy --buildkit --remote-only --ha=false")
244+
// Retry deploy to handle transient DNS/WireGuard connection failures
245+
require.EventuallyWithT(tt, func(c *assert.CollectT) {
246+
result := f.FlyAllowExitFailure("deploy --buildkit --remote-only --ha=false")
247+
if result.ExitCode() != 0 {
248+
stderr := result.StdErrString()
249+
// Retry on DNS lookup failures and WireGuard connection errors
250+
if strings.Contains(stderr, "no such host") || strings.Contains(stderr, "failed wireguard connection") {
251+
t.Logf("Transient network error detected, retrying... (error: %s)", stderr)
252+
assert.Fail(c, "transient network error, retrying...")
253+
} else {
254+
// Non-retryable error
255+
assert.Fail(c, fmt.Sprintf("deploy failed with unexpected error: %s", stderr))
256+
}
257+
} else {
258+
assert.True(c, true, "deploy succeeded")
259+
}
260+
}, 60*time.Second, 5*time.Second, "deploy should succeed after retrying transient network errors")
245261

246262
t.Logf("deploy %s again", appName)
247-
f.Fly("deploy --buildkit --remote-only --strategy immediate --ha=false")
263+
// Retry second deploy as well
264+
require.EventuallyWithT(tt, func(c *assert.CollectT) {
265+
result := f.FlyAllowExitFailure("deploy --buildkit --remote-only --strategy immediate --ha=false")
266+
if result.ExitCode() != 0 {
267+
stderr := result.StdErrString()
268+
if strings.Contains(stderr, "no such host") || strings.Contains(stderr, "failed wireguard connection") {
269+
t.Logf("Transient network error detected, retrying... (error: %s)", stderr)
270+
assert.Fail(c, "transient network error, retrying...")
271+
} else {
272+
assert.Fail(c, fmt.Sprintf("deploy failed with unexpected error: %s", stderr))
273+
}
274+
} else {
275+
assert.True(c, true, "deploy succeeded")
276+
}
277+
}, 60*time.Second, 5*time.Second, "deploy should succeed after retrying transient network errors")
248278

249279
body, err := testlib.RunHealthCheck(fmt.Sprintf("https://%s.fly.dev", appName))
250280
require.NoError(t, err)
@@ -273,10 +303,40 @@ func testDeployNodeAppWithBuildKitRemoteBuilder(tt *testing.T) {
273303
require.NoError(t, err)
274304

275305
t.Logf("deploy %s with BuildKit", appName)
276-
f.Fly("deploy --buildkit --remote-only --ha=false")
306+
// Retry deploy to handle transient DNS/WireGuard connection failures
307+
require.EventuallyWithT(tt, func(c *assert.CollectT) {
308+
result := f.FlyAllowExitFailure("deploy --buildkit --remote-only --ha=false")
309+
if result.ExitCode() != 0 {
310+
stderr := result.StdErrString()
311+
// Retry on DNS lookup failures and WireGuard connection errors
312+
if strings.Contains(stderr, "no such host") || strings.Contains(stderr, "failed wireguard connection") {
313+
t.Logf("Transient network error detected, retrying... (error: %s)", stderr)
314+
assert.Fail(c, "transient network error, retrying...")
315+
} else {
316+
// Non-retryable error
317+
assert.Fail(c, fmt.Sprintf("deploy failed with unexpected error: %s", stderr))
318+
}
319+
} else {
320+
assert.True(c, true, "deploy succeeded")
321+
}
322+
}, 60*time.Second, 5*time.Second, "deploy should succeed after retrying transient network errors")
277323

278324
t.Logf("deploy %s again with BuildKit", appName)
279-
f.Fly("deploy --buildkit --remote-only --strategy immediate --ha=false")
325+
// Retry second deploy as well
326+
require.EventuallyWithT(tt, func(c *assert.CollectT) {
327+
result := f.FlyAllowExitFailure("deploy --buildkit --remote-only --strategy immediate --ha=false")
328+
if result.ExitCode() != 0 {
329+
stderr := result.StdErrString()
330+
if strings.Contains(stderr, "no such host") || strings.Contains(stderr, "failed wireguard connection") {
331+
t.Logf("Transient network error detected, retrying... (error: %s)", stderr)
332+
assert.Fail(c, "transient network error, retrying...")
333+
} else {
334+
assert.Fail(c, fmt.Sprintf("deploy failed with unexpected error: %s", stderr))
335+
}
336+
} else {
337+
assert.True(c, true, "deploy succeeded")
338+
}
339+
}, 60*time.Second, 5*time.Second, "deploy should succeed after retrying transient network errors")
280340

281341
body, err := testlib.RunHealthCheck(fmt.Sprintf("https://%s.fly.dev", appName))
282342
require.NoError(t, err)

0 commit comments

Comments
 (0)