@@ -94,7 +94,7 @@ func TestFlyDeployHAPlacement(t *testing.T) {
9494 // The backend may not have replicated the app record to all hosts yet when
9595 // creating the second machine for HA, resulting in "sql: no rows in result set" errors
9696 var lastError string
97- require .EventuallyWithT (f , func (c * assert.CollectT ) {
97+ require .EventuallyWithT (t , func (c * assert.CollectT ) {
9898 result := f .FlyAllowExitFailure ("deploy --buildkit --remote-only" )
9999 if result .ExitCode () != 0 {
100100 stderr := result .StdErrString ()
@@ -241,10 +241,40 @@ func testDeployNodeAppWithRemoteBuilder(tt *testing.T) {
241241 require .NoError (t , err )
242242
243243 t .Logf ("deploy %s" , appName )
244- f .Fly ("deploy --buildkit --remote-only --ha=false" )
244+ // Retry deploy to handle transient DNS/WireGuard connection failures
245+ require .EventuallyWithT (tt , func (c * assert.CollectT ) {
246+ result := f .FlyAllowExitFailure ("deploy --buildkit --remote-only --ha=false" )
247+ if result .ExitCode () != 0 {
248+ stderr := result .StdErrString ()
249+ // Retry on DNS lookup failures and WireGuard connection errors
250+ if strings .Contains (stderr , "no such host" ) || strings .Contains (stderr , "failed wireguard connection" ) {
251+ t .Logf ("Transient network error detected, retrying... (error: %s)" , stderr )
252+ assert .Fail (c , "transient network error, retrying..." )
253+ } else {
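254+ // Unexpected (non-transient) error: recorded as a failure for this tick, but EventuallyWithT still re-runs the deploy until the timeout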
255+ assert .Fail (c , fmt .Sprintf ("deploy failed with unexpected error: %s" , stderr ))
256+ }
257+ } else {
258+ assert .True (c , true , "deploy succeeded" )
259+ }
260+ }, 60 * time .Second , 5 * time .Second , "deploy should succeed after retrying transient network errors" )
245261
246262 t .Logf ("deploy %s again" , appName )
247- f .Fly ("deploy --buildkit --remote-only --strategy immediate --ha=false" )
263+ // Retry second deploy as well
264+ require .EventuallyWithT (tt , func (c * assert.CollectT ) {
265+ result := f .FlyAllowExitFailure ("deploy --buildkit --remote-only --strategy immediate --ha=false" )
266+ if result .ExitCode () != 0 {
267+ stderr := result .StdErrString ()
268+ if strings .Contains (stderr , "no such host" ) || strings .Contains (stderr , "failed wireguard connection" ) {
269+ t .Logf ("Transient network error detected, retrying... (error: %s)" , stderr )
270+ assert .Fail (c , "transient network error, retrying..." )
271+ } else {
272+ assert .Fail (c , fmt .Sprintf ("deploy failed with unexpected error: %s" , stderr ))
273+ }
274+ } else {
275+ assert .True (c , true , "deploy succeeded" )
276+ }
277+ }, 60 * time .Second , 5 * time .Second , "deploy should succeed after retrying transient network errors" )
248278
249279 body , err := testlib .RunHealthCheck (fmt .Sprintf ("https://%s.fly.dev" , appName ))
250280 require .NoError (t , err )
@@ -273,10 +303,40 @@ func testDeployNodeAppWithBuildKitRemoteBuilder(tt *testing.T) {
273303 require .NoError (t , err )
274304
275305 t .Logf ("deploy %s with BuildKit" , appName )
276- f .Fly ("deploy --buildkit --remote-only --ha=false" )
306+ // Retry deploy to handle transient DNS/WireGuard connection failures
307+ require .EventuallyWithT (tt , func (c * assert.CollectT ) {
308+ result := f .FlyAllowExitFailure ("deploy --buildkit --remote-only --ha=false" )
309+ if result .ExitCode () != 0 {
310+ stderr := result .StdErrString ()
311+ // Retry on DNS lookup failures and WireGuard connection errors
312+ if strings .Contains (stderr , "no such host" ) || strings .Contains (stderr , "failed wireguard connection" ) {
313+ t .Logf ("Transient network error detected, retrying... (error: %s)" , stderr )
314+ assert .Fail (c , "transient network error, retrying..." )
315+ } else {
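316+ // Unexpected (non-transient) error: recorded as a failure for this tick, but EventuallyWithT still re-runs the deploy until the timeout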
317+ assert .Fail (c , fmt .Sprintf ("deploy failed with unexpected error: %s" , stderr ))
318+ }
319+ } else {
320+ assert .True (c , true , "deploy succeeded" )
321+ }
322+ }, 60 * time .Second , 5 * time .Second , "deploy should succeed after retrying transient network errors" )
277323
278324 t .Logf ("deploy %s again with BuildKit" , appName )
279- f .Fly ("deploy --buildkit --remote-only --strategy immediate --ha=false" )
325+ // Retry second deploy as well
326+ require .EventuallyWithT (tt , func (c * assert.CollectT ) {
327+ result := f .FlyAllowExitFailure ("deploy --buildkit --remote-only --strategy immediate --ha=false" )
328+ if result .ExitCode () != 0 {
329+ stderr := result .StdErrString ()
330+ if strings .Contains (stderr , "no such host" ) || strings .Contains (stderr , "failed wireguard connection" ) {
331+ t .Logf ("Transient network error detected, retrying... (error: %s)" , stderr )
332+ assert .Fail (c , "transient network error, retrying..." )
333+ } else {
334+ assert .Fail (c , fmt .Sprintf ("deploy failed with unexpected error: %s" , stderr ))
335+ }
336+ } else {
337+ assert .True (c , true , "deploy succeeded" )
338+ }
339+ }, 60 * time .Second , 5 * time .Second , "deploy should succeed after retrying transient network errors" )
280340
281341 body , err := testlib .RunHealthCheck (fmt .Sprintf ("https://%s.fly.dev" , appName ))
282342 require .NoError (t , err )
0 commit comments