Skip to content

Commit 54a1c93

Browse files
committed
Add retry logic for pg create to handle transient volume provisioning issues
1 parent 264005d commit 54a1c93

File tree

1 file changed

+219
-23
lines changed

1 file changed

+219
-23
lines changed

test/preflight/fly_postgres_test.go

Lines changed: 219 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -24,10 +24,33 @@ func TestPostgres_singleNode(t *testing.T) {
2424
t.Skip()
2525
}
2626

27-
f.Fly(
28-
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size shared-cpu-1x --volume-size 1",
29-
f.OrgSlug(), appName, f.PrimaryRegion(),
30-
)
27+
// Retry pg create up to 3 times due to transient volume provisioning issues
28+
var pgCreateErr error
29+
for attempt := 1; attempt <= 3; attempt++ {
30+
result := f.FlyAllowExitFailure(
31+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size shared-cpu-1x --volume-size 1",
32+
f.OrgSlug(), appName, f.PrimaryRegion(),
33+
)
34+
35+
if result.ExitCode() == 0 {
36+
pgCreateErr = nil
37+
break
38+
}
39+
40+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
41+
42+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
43+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
44+
f.FlyAllowExitFailure("apps destroy %s --yes", appName)
45+
time.Sleep(5 * time.Second)
46+
} else if attempt < 3 {
47+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
48+
time.Sleep(2 * time.Second)
49+
}
50+
}
51+
52+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
53+
3154
f.Fly("status -a %s", appName)
3255
f.Fly("config save -a %s", appName)
3356
f.Fly("config validate")
@@ -44,10 +67,33 @@ func TestPostgres_autostart(t *testing.T) {
4467

4568
appName := f.CreateRandomAppName()
4669

47-
f.Fly(
48-
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1",
49-
f.OrgSlug(), appName, f.PrimaryRegion(), postgresMachineSize,
50-
)
70+
// Retry pg create up to 3 times due to transient volume provisioning issues
71+
var pgCreateErr error
72+
for attempt := 1; attempt <= 3; attempt++ {
73+
result := f.FlyAllowExitFailure(
74+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1",
75+
f.OrgSlug(), appName, f.PrimaryRegion(), postgresMachineSize,
76+
)
77+
78+
if result.ExitCode() == 0 {
79+
pgCreateErr = nil
80+
break
81+
}
82+
83+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
84+
85+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
86+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
87+
f.FlyAllowExitFailure("apps destroy %s --yes", appName)
88+
time.Sleep(5 * time.Second)
89+
} else if attempt < 3 {
90+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
91+
time.Sleep(2 * time.Second)
92+
}
93+
}
94+
95+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
96+
5197
machList := f.MachinesList(appName)
5298
require.Equal(t, 1, len(machList), "expected exactly 1 machine after launch")
5399
firstMachine := machList[0]
@@ -58,7 +104,34 @@ func TestPostgres_autostart(t *testing.T) {
58104
}
59105

60106
appName = f.CreateRandomAppName()
61-
f.Fly("pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size shared-cpu-1x --volume-size 1 --autostart", f.OrgSlug(), appName, f.PrimaryRegion())
107+
108+
// Retry second pg create
109+
pgCreateErr = nil
110+
for attempt := 1; attempt <= 3; attempt++ {
111+
result := f.FlyAllowExitFailure(
112+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size shared-cpu-1x --volume-size 1 --autostart",
113+
f.OrgSlug(), appName, f.PrimaryRegion(),
114+
)
115+
116+
if result.ExitCode() == 0 {
117+
pgCreateErr = nil
118+
break
119+
}
120+
121+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
122+
123+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
124+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
125+
f.FlyAllowExitFailure("apps destroy %s --yes", appName)
126+
time.Sleep(5 * time.Second)
127+
} else if attempt < 3 {
128+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
129+
time.Sleep(2 * time.Second)
130+
}
131+
}
132+
133+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
134+
62135
machList = f.MachinesList(appName)
63136
require.Equal(t, 1, len(machList), "expected exactly 1 machine after launch")
64137
firstMachine = machList[0]
@@ -95,7 +168,33 @@ func TestPostgres_FlexFailover(t *testing.T) {
95168
return ""
96169
}
97170

98-
f.Fly("pg create --flex --org %s --name %s --region %s --initial-cluster-size 3 --vm-size shared-cpu-1x --volume-size 1", f.OrgSlug(), appName, f.PrimaryRegion())
171+
// Retry pg create up to 3 times due to transient volume provisioning issues
172+
var pgCreateErr error
173+
for attempt := 1; attempt <= 3; attempt++ {
174+
result := f.FlyAllowExitFailure(
175+
"pg create --flex --org %s --name %s --region %s --initial-cluster-size 3 --vm-size shared-cpu-1x --volume-size 1",
176+
f.OrgSlug(), appName, f.PrimaryRegion(),
177+
)
178+
179+
if result.ExitCode() == 0 {
180+
pgCreateErr = nil
181+
break
182+
}
183+
184+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
185+
186+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
187+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
188+
f.FlyAllowExitFailure("apps destroy %s --yes", appName)
189+
time.Sleep(5 * time.Second)
190+
} else if attempt < 3 {
191+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
192+
time.Sleep(2 * time.Second)
193+
}
194+
}
195+
196+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
197+
99198
machList := f.MachinesList(appName)
100199
leaderMachineID := findLeaderID(machList)
101200
if leaderMachineID == "" {
@@ -119,7 +218,37 @@ func TestPostgres_NoMachines(t *testing.T) {
119218

120219
appName := f.CreateRandomAppName()
121220

122-
f.Fly("pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size shared-cpu-1x --volume-size 1", f.OrgSlug(), appName, f.PrimaryRegion())
221+
// Retry pg create up to 3 times due to transient volume provisioning issues
222+
var pgCreateErr error
223+
for attempt := 1; attempt <= 3; attempt++ {
224+
result := f.FlyAllowExitFailure(
225+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size shared-cpu-1x --volume-size 1",
226+
f.OrgSlug(), appName, f.PrimaryRegion(),
227+
)
228+
229+
if result.ExitCode() == 0 {
230+
// Success!
231+
pgCreateErr = nil
232+
break
233+
}
234+
235+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
236+
237+
// If this was a volume-related error and we have retries left, clean up and retry
238+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
239+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
240+
// Clean up the partially created app before retrying
241+
f.FlyAllowExitFailure("apps destroy %s --yes", appName)
242+
time.Sleep(5 * time.Second) // Give the platform time to clean up
243+
} else if attempt < 3 {
244+
// Other error, still retry but don't clean up
245+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
246+
time.Sleep(2 * time.Second)
247+
}
248+
}
249+
250+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
251+
123252
machList := f.MachinesList(appName)
124253
require.Equal(t, 1, len(machList), "expected exactly 1 machine after launch")
125254
firstMachine := machList[0]
@@ -213,14 +342,59 @@ func TestPostgres_ImportSuccess(t *testing.T) {
213342
firstAppName := f.CreateRandomAppName()
214343
secondAppName := f.CreateRandomAppName()
215344

216-
f.Fly(
217-
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1 --password x",
218-
f.OrgSlug(), firstAppName, f.PrimaryRegion(), postgresMachineSize,
219-
)
220-
f.Fly(
221-
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1",
222-
f.OrgSlug(), secondAppName, f.PrimaryRegion(), postgresMachineSize,
223-
)
345+
// Retry first pg create up to 3 times due to transient volume provisioning issues
346+
var pgCreateErr error
347+
for attempt := 1; attempt <= 3; attempt++ {
348+
result := f.FlyAllowExitFailure(
349+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1 --password x",
350+
f.OrgSlug(), firstAppName, f.PrimaryRegion(), postgresMachineSize,
351+
)
352+
353+
if result.ExitCode() == 0 {
354+
pgCreateErr = nil
355+
break
356+
}
357+
358+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
359+
360+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
361+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
362+
f.FlyAllowExitFailure("apps destroy %s --yes", firstAppName)
363+
time.Sleep(5 * time.Second)
364+
} else if attempt < 3 {
365+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
366+
time.Sleep(2 * time.Second)
367+
}
368+
}
369+
370+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
371+
372+
// Retry second pg create
373+
pgCreateErr = nil
374+
for attempt := 1; attempt <= 3; attempt++ {
375+
result := f.FlyAllowExitFailure(
376+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1",
377+
f.OrgSlug(), secondAppName, f.PrimaryRegion(), postgresMachineSize,
378+
)
379+
380+
if result.ExitCode() == 0 {
381+
pgCreateErr = nil
382+
break
383+
}
384+
385+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
386+
387+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
388+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
389+
f.FlyAllowExitFailure("apps destroy %s --yes", secondAppName)
390+
time.Sleep(5 * time.Second)
391+
} else if attempt < 3 {
392+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
393+
time.Sleep(2 * time.Second)
394+
}
395+
}
396+
397+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
224398
assert.EventuallyWithT(t, func(t *assert.CollectT) {
225399
assertPostgresIsUp(t, f, firstAppName)
226400
}, 1*time.Minute, 10*time.Second)
@@ -265,10 +439,32 @@ func TestPostgres_ImportFailure(t *testing.T) {
265439

266440
appName := f.CreateRandomAppName()
267441

268-
f.Fly(
269-
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1 --password x",
270-
f.OrgSlug(), appName, f.PrimaryRegion(), postgresMachineSize,
271-
)
442+
// Retry pg create up to 3 times due to transient volume provisioning issues
443+
var pgCreateErr error
444+
for attempt := 1; attempt <= 3; attempt++ {
445+
result := f.FlyAllowExitFailure(
446+
"pg create --org %s --name %s --region %s --initial-cluster-size 1 --vm-size %s --volume-size 1 --password x",
447+
f.OrgSlug(), appName, f.PrimaryRegion(), postgresMachineSize,
448+
)
449+
450+
if result.ExitCode() == 0 {
451+
pgCreateErr = nil
452+
break
453+
}
454+
455+
pgCreateErr = fmt.Errorf("pg create failed (attempt %d/3): %s", attempt, result.StdErrString())
456+
457+
if strings.Contains(result.StdErrString(), "volume not found") && attempt < 3 {
458+
f.Logf("Volume provisioning failed (attempt %d/3), retrying...", attempt)
459+
f.FlyAllowExitFailure("apps destroy %s --yes", appName)
460+
time.Sleep(5 * time.Second)
461+
} else if attempt < 3 {
462+
f.Logf("pg create failed (attempt %d/3): %v, retrying...", attempt, result.StdErrString())
463+
time.Sleep(2 * time.Second)
464+
}
465+
}
466+
467+
require.NoError(f, pgCreateErr, "pg create failed after 3 attempts")
272468
assert.EventuallyWithT(t, func(t *assert.CollectT) {
273469
assertPostgresIsUp(t, f, appName)
274470
}, 1*time.Minute, 10*time.Second)

0 commit comments

Comments
 (0)