Skip to content

Commit 178b0d6

Browse files
tmp: test TestRestoreTablesBatchRetryIntegration flakiness
1 parent a617bfe commit 178b0d6

File tree

1 file changed

+41
-39
lines changed

1 file changed

+41
-39
lines changed

pkg/service/restore/restore_integration_test.go

Lines changed: 41 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -782,49 +782,51 @@ func TestRestoreTablesBatchRetryIntegration(t *testing.T) {
782782
"restore_tables": true,
783783
}
784784

785-
t.Run("batch retry finished with success", func(t *testing.T) {
786-
Print("Inject errors to some download and las calls")
787-
downloadCnt := atomic.Int64{}
788-
lasCnt := atomic.Int64{}
789-
h.dstCluster.Hrt.SetInterceptor(httpx.RoundTripperFunc(func(req *http.Request) (*http.Response, error) {
790-
// For this setup, we have 6 remote sstable dirs and 6 workers.
791-
// We inject 2 errors during download and 3 errors during LAS.
792-
// This means that only a single node will be restoring at the end.
793-
// Huge batch size and 3 LAS errors guarantee a total of 9 calls to LAS.
794-
// The last failed call to LAS (cnt=8) waits a bit so that we test
795-
// that batch dispatcher correctly reuses and releases nodes waiting
796-
// for failed sstables to come back to the batch dispatcher.
797-
if strings.HasPrefix(req.URL.Path, "/agent/rclone/sync/copypaths") {
798-
if cnt := downloadCnt.Add(1); cnt == 1 || cnt == 3 {
799-
t.Log("Fake download error ", cnt)
800-
return nil, downloadErr
801-
}
802-
}
803-
if strings.HasPrefix(req.URL.Path, "/storage_service/sstables/") {
804-
cnt := lasCnt.Add(1)
805-
if cnt == 8 {
806-
time.Sleep(15 * time.Second)
785+
for i := range 20 {
786+
t.Run(fmt.Sprintf("%d: batch retry finished with success", i), func(t *testing.T) {
787+
Print("Inject errors to some download and las calls")
788+
downloadCnt := atomic.Int64{}
789+
lasCnt := atomic.Int64{}
790+
h.dstCluster.Hrt.SetInterceptor(httpx.RoundTripperFunc(func(req *http.Request) (*http.Response, error) {
791+
// For this setup, we have 6 remote sstable dirs and 6 workers.
792+
// We inject 2 errors during download and 3 errors during LAS.
793+
// This means that only a single node will be restoring at the end.
794+
// Huge batch size and 3 LAS errors guarantee a total of 9 calls to LAS.
795+
// The last failed call to LAS (cnt=8) waits a bit so that we test
796+
// that batch dispatcher correctly reuses and releases nodes waiting
797+
// for failed sstables to come back to the batch dispatcher.
798+
if strings.HasPrefix(req.URL.Path, "/agent/rclone/sync/copypaths") {
799+
if cnt := downloadCnt.Add(1); cnt == 1 || cnt == 3 {
800+
t.Log("Fake download error ", cnt)
801+
return nil, downloadErr
802+
}
807803
}
808-
if cnt == 1 || cnt == 5 || cnt == 8 {
809-
t.Log("Fake LAS error ", cnt)
810-
return nil, lasErr
804+
if strings.HasPrefix(req.URL.Path, "/storage_service/sstables/") {
805+
cnt := lasCnt.Add(1)
806+
if cnt == 8 {
807+
time.Sleep(15 * time.Second)
808+
}
809+
if cnt == 1 || cnt == 5 || cnt == 8 {
810+
t.Log("Fake LAS error ", cnt)
811+
return nil, lasErr
812+
}
811813
}
812-
}
813-
return nil, nil
814-
}))
814+
return nil, nil
815+
}))
815816

816-
Print("Run restore")
817-
grantRestoreTablesPermissions(t, h.dstCluster.rootSession, ksFilter, h.dstUser)
818-
h.runRestore(t, props)
817+
Print("Run restore")
818+
grantRestoreTablesPermissions(t, h.dstCluster.rootSession, ksFilter, h.dstUser)
819+
h.runRestore(t, props)
819820

820-
Print("Validate success")
821-
if cnt := lasCnt.Add(0); cnt < 9 {
822-
t.Fatalf("Expected at least 9 calls to LAS, got %d", cnt)
823-
}
824-
validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab1, "id", "data")
825-
validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab2, "id", "data")
826-
validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab3, "id", "data")
827-
})
821+
Print("Validate success")
822+
if cnt := lasCnt.Add(0); cnt < 9 {
823+
t.Fatalf("Expected at least 9 calls to LAS, got %d", cnt)
824+
}
825+
validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab1, "id", "data")
826+
validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab2, "id", "data")
827+
validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab3, "id", "data")
828+
})
829+
}
828830

829831
t.Run("restore with injected failures only", func(t *testing.T) {
830832
Print("Inject errors to all download and las calls")

0 commit comments

Comments
 (0)