Skip to content

Commit ec9e631

Browse files
feat(repair): tablet repair API filtering (#4301)
* Revert "Tablet repair API (#4274)" This reverts commit 76b118d. * feat(scyllaclient): add methods for managing tablet repair API * feat(repair): use tablet repair API Scylla 2025.1.0 is introducing new tablet repair API. SM should use it for repairing tablet tables with a single API call. Fixes #4188 * feat(repair): don't disable tablet load balancing from Scylla 2025.1.0 Scylla 2025.1.0 is introducing new tablet repair API which does not require stopping tablet load balancing during the repair. Fixes #4273 * feat(repair): add validation for tablet repair API filtering SM repair task supports 3 types of host filtering: * --dc - controlled by API dc filter * --ignore-down-nodes - controlled by API host filter * --host - does not make sense for tablet table This commit adds validation for --ignores-down-nodes and --host configurations. Fixes #4292 * chore(repair_test): update repair API parsing with tablet repair * refactor(repair_test): move node stopping/starting to helper methods * fix(repair_test): don't use repair interceptor by default This might result in hiding some issues, so we better use it only when it's directly specified by the test. * feat(repair_test): add tests specific to tablet repair API * fix(repair_test): adjust tests to tablet repair API * chore(.github): test against latest Scylla It already contains the tablet repair API filtering features, so we should start testing with it and consider moving to more stable build later on. Unfortunately, this requires fixing #4303. Ref #4303 * fix(repair): adjust comments * fix(repair): fix tablet load balancing handling in separate method This commit fixes a bug discovered by: #4301 (comment) SM was handling tablet load balancing for repair completely wrong for cluster with both vnode and tablet keyspaces. In such cases, it was enabling tablet load balancing when repairing tablet keyspaces, and disabling it when it was repairing vnode keyspaces. * feat(repair_test): extend repair API test for previous Scylla versions * refactor(repair_test): add tabletRepairSupport helper function
1 parent fb09d66 commit ec9e631

13 files changed

+380
-186
lines changed

.github/cfg/integration-test-cfg.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,13 @@
4646
tablets: enabled
4747
ssl-enabled: false
4848

49-
- scylla-version: scylla:2025.1.0-rc3
49+
- scylla-version: scylladb-ci:2025.2.0-dev-0.20250310.8d676048a6d9
5050
ip-family: IPV4
5151
raft-schema: none
5252
tablets: disabled
5353
ssl-enabled: true
5454

55-
- scylla-version: scylla:2025.1.0-rc3
55+
- scylla-version: scylladb-ci:2025.2.0-dev-0.20250310.8d676048a6d9
5656
ip-family: IPV4
5757
raft-schema: none
5858
tablets: enabled

.github/workflows/integration-tests-2025.1.0-rc3-IPV4-tablets-nossl.yaml renamed to .github/workflows/integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4-tablets-nossl.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ concurrency:
22
cancel-in-progress: true
33
group: int-${{ github.workflow }}-${{ github.ref }}
44
env:
5-
scylla-version: scylla:2025.1.0-rc3
5+
scylla-version: scylladb-ci:2025.2.0-dev-0.20250310.8d676048a6d9
66
ip-family: IPV4
77
raft-schema: none
88
tablets: enabled
@@ -106,7 +106,7 @@ jobs:
106106
run: make pkg-integration-test PKG=./pkg/store
107107
- name: Run migrate tests
108108
run: make pkg-integration-test PKG=./pkg/schema/migrate
109-
name: integration-tests-2025.1.0-rc3-IPV4-tablets-nossl
109+
name: integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4-tablets-nossl
110110
"on":
111111
pull_request:
112112
types:

.github/workflows/integration-tests-2025.1.0-rc3-IPV4.yaml renamed to .github/workflows/integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ concurrency:
22
cancel-in-progress: true
33
group: int-${{ github.workflow }}-${{ github.ref }}
44
env:
5-
scylla-version: scylla:2025.1.0-rc3
5+
scylla-version: scylladb-ci:2025.2.0-dev-0.20250310.8d676048a6d9
66
ip-family: IPV4
77
raft-schema: none
88
tablets: disabled
@@ -106,7 +106,7 @@ jobs:
106106
run: make pkg-integration-test PKG=./pkg/store
107107
- name: Run migrate tests
108108
run: make pkg-integration-test PKG=./pkg/schema/migrate
109-
name: integration-tests-2025.1.0-rc3-IPV4
109+
name: integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4
110110
"on":
111111
pull_request:
112112
types:

README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ Scylla Manager consists of tree components:
1515

1616
## Scylla integration status
1717

18-
| ScyllaDB version | Workflows | Limitations |
19-
|------------------|-------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
20-
| **2024.1.12** | ![integration-tests-2024.1.12-IPV4]<br/>![integration-tests-2024.1.12-IPV6] | Restoration of schema into cluster with `consistant_cluster_management: true` is not supported |
21-
| **2023.1.11** | ![integration-tests-2023.1.11-IPV4]<br/>![integration-tests-2023.1.11-IPV4-raftschema]<br/>![integration-tests-2023.1.11-IPV6-raftschema] | Restoration of schema into cluster with `consistant_cluster_management: true` is not supported |
22-
| **6.2.0** | ![integration-tests-6.2.0-IPV4]<br/>![integration-tests-6.2.0-IPV4-tablets]<br/>![integration-tests-6.2.0-IPV6-tablets-nossl] | Restoration of **Authentication** and **Service Levels** is not supported<br/>Restoration of schema containing **Alternator** tables is not supported |
23-
| **2025.1.0-rc3** | ![integration-tests-2025.1.0-rc3-IPV4]<br/>![integration-tests-2025.1.0-rc3-IPV4-tablets-nossl] | Restoration of **Authentication** and **Service Levels** is not supported<br/>Restoration of schema containing **Alternator** tables is not supported |
18+
| ScyllaDB version | Workflows | Limitations |
19+
|------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
20+
| **2024.1.12** | ![integration-tests-2024.1.12-IPV4]<br/>![integration-tests-2024.1.12-IPV6] | Restoration of schema into cluster with `consistant_cluster_management: true` is not supported |
21+
| **2023.1.11** | ![integration-tests-2023.1.11-IPV4]<br/>![integration-tests-2023.1.11-IPV4-raftschema]<br/>![integration-tests-2023.1.11-IPV6-raftschema] | Restoration of schema into cluster with `consistant_cluster_management: true` is not supported |
22+
| **6.2.0** | ![integration-tests-6.2.0-IPV4]<br/>![integration-tests-6.2.0-IPV4-tablets]<br/>![integration-tests-6.2.0-IPV6-tablets-nossl] | Restoration of **Authentication** and **Service Levels** is not supported<br/>Restoration of schema containing **Alternator** tables is not supported |
23+
| **2025.2.0-dev-0.20250310.8d676048a6d9** | ![integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4]<br/>![integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4-tablets-nossl] | Restoration of **Authentication** and **Service Levels** is not supported<br/>Restoration of schema containing **Alternator** tables is not supported |
2424

2525
[integration-tests-2024.1.12-IPV4]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-2024.1.12-IPV4.yaml/badge.svg?branch=master
2626
[integration-tests-2024.1.12-IPV6]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-2024.1.12-IPV6.yaml/badge.svg?branch=master
@@ -30,8 +30,8 @@ Scylla Manager consists of tree components:
3030
[integration-tests-6.2.0-IPV4]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-6.2.0-IPV4.yaml/badge.svg?branch=master
3131
[integration-tests-6.2.0-IPV4-tablets]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-6.2.0-IPV4-tablets.yaml/badge.svg?branch=master
3232
[integration-tests-6.2.0-IPV6-tablets-nossl]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-6.2.0-IPV6-tablets-nossl.yaml/badge.svg?branch=master
33-
[integration-tests-2025.1.0-rc3-IPV4]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-2025.1.0-rc3-IPV4.yaml/badge.svg?branch=master
34-
[integration-tests-2025.1.0-rc3-IPV4-tablets-nossl]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-2025.1.0-rc3-IPV4-tablets-nossl.yaml/badge.svg?branch=master
33+
[integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4.yaml/badge.svg?branch=master
34+
[integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4-tablets-nossl]: https://github.com/scylladb/scylla-manager/actions/workflows/integration-tests-2025.2.0-dev-0.20250310.8d676048a6d9-IPV4-tablets-nossl.yaml/badge.svg?branch=master
3535

3636
## Installing and updating Go
3737

pkg/scyllaclient/client_agent.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,8 @@ func (ni *NodeInfo) SupportsRepairSmallTableOptimization() (bool, error) {
214214
return supports, nil
215215
}
216216

217-
// SupportsTabletRepairNoHostFiltering returns true if /storage_service/tablets/repair API is exposed.
218-
// It might not necessarily allow for host filtering.
219-
func (ni *NodeInfo) SupportsTabletRepairNoHostFiltering() (bool, error) {
217+
// SupportsTabletRepair returns true if /storage_service/tablets/repair API is exposed.
218+
func (ni *NodeInfo) SupportsTabletRepair() (bool, error) {
220219
// Detect master builds
221220
if scyllaversion.MasterVersion(ni.ScyllaVersion) {
222221
return true, nil

pkg/scyllaclient/client_scylla.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -592,19 +592,27 @@ func ReplicaHash(replicaSet []string) uint64 {
592592
}
593593

594594
// TabletRepair schedules Scylla repair tablet table task and returns its ID.
595-
// The whole table will be repaired on all nodes with just a single task.
596-
// The host is only needed so that we know which node should be queried
597-
// for the task status.
598-
func (c *Client) TabletRepair(ctx context.Context, keyspace, table, host string) (string, error) {
595+
// All tablets will be repaired with just a single task. It repairs all hosts
596+
// by default, but it's possible to filter them by DC or host ID. The master is
597+
// only needed so that we know which node should be queried for the task status.
598+
func (c *Client) TabletRepair(ctx context.Context, keyspace, table, master string, dcs, hostIDs []string) (string, error) {
599599
const allTablets = "all"
600600
dontAwaitCompletion := "false"
601601
p := operations.StorageServiceTabletsRepairPostParams{
602-
Context: forceHost(ctx, host),
602+
Context: forceHost(ctx, master),
603603
Ks: keyspace,
604604
Table: table,
605605
Tokens: allTablets,
606606
AwaitCompletion: &dontAwaitCompletion,
607607
}
608+
if len(dcs) > 0 {
609+
merged := strings.Join(dcs, ",")
610+
p.SetDcsFilter(&merged)
611+
}
612+
if len(hostIDs) > 0 {
613+
merged := strings.Join(hostIDs, ",")
614+
p.SetHostsFilter(&merged)
615+
}
608616
resp, err := c.scyllaOps.StorageServiceTabletsRepairPost(&p)
609617
if err != nil {
610618
return "", err

pkg/service/repair/generator.go

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -126,21 +126,13 @@ func (g *generator) Run(ctx context.Context) (err error) {
126126
var genErr error
127127

128128
defer func() {
129-
// Always leave tablet migration enabled after repair
130-
tabletBalancingErr := g.ringDescriber.ControlTabletLoadBalancing(context.Background(), true)
131-
err = stdErrors.Join(err, errors.Wrap(tabletBalancingErr, "control post repair tablet load balancing"))
129+
balancingErr := g.handleTabletLoadBalancing(context.Background(), "")
130+
err = stdErrors.Join(err, errors.Wrap(balancingErr, "control post repair tablet load balancing"))
132131
}()
133132

134133
for _, ksp := range g.plan.Keyspaces {
135-
if !g.plan.apiSupport.tabletRepairNoHostFiltering {
136-
// Disable tablet migration when repairing tablet table.
137-
// Without that it could be possible that some tablet "escapes" being
138-
// repaired by migrating from not yet repaired token range to already repaired one.
139-
// We don't need to do it when tablet repair API is available
140-
// (even if it does not allow for host filtering).
141-
if err := g.ringDescriber.ControlTabletLoadBalancing(ctx, g.ringDescriber.IsTabletKeyspace(ksp.Keyspace)); err != nil {
142-
return errors.Wrapf(err, "control tablet load balancing")
143-
}
134+
if err := g.handleTabletLoadBalancing(ctx, ksp.Keyspace); err != nil {
135+
return errors.Wrapf(err, "control tablet load balancing")
144136
}
145137

146138
for _, tp := range ksp.Tables {
@@ -167,6 +159,34 @@ func (g *generator) Run(ctx context.Context) (err error) {
167159
return errors.Wrap(genErr, "see more errors in logs")
168160
}
169161

162+
// handleTabletLoadBalancing, if needed, controls tablet load
163+
// balancing, so that the provided keyspace can be safely repaired.
164+
// It must be called before repairing any keyspace.
165+
// It must also be called after finishing all repairs,
166+
// but this time with an empty keyspace argument.
167+
func (g *generator) handleTabletLoadBalancing(ctx context.Context, keyspace string) error {
168+
// No need to disable tablet load balancing since
169+
// tablet repair API handles it correctly.
170+
if g.plan.apiSupport.tabletRepair {
171+
return nil
172+
}
173+
// Re-enable tablet load balancing after the repair if finished.
174+
if keyspace == "" {
175+
return g.ringDescriber.ControlTabletLoadBalancing(ctx, true)
176+
}
177+
// Need to disable tablet load balancing when repairing tablet
178+
// keyspace with old repair API since it does not respect tablet migrations.
179+
// Not disabling tablet migration would result in skipping repair of tablets
180+
// which migrated from not yet repaired token range to already repaired one.
181+
// It wouldn't result in data resurrection, as tablets have tombstone_gc
182+
// mode set to 'repair'.
183+
if g.ringDescriber.IsTabletKeyspace(keyspace) {
184+
return g.ringDescriber.ControlTabletLoadBalancing(ctx, false)
185+
}
186+
// No need to disable tablet load balancing for vnode keyspace.
187+
return g.ringDescriber.ControlTabletLoadBalancing(ctx, true)
188+
}
189+
170190
func (g *generator) newTableGenerator(keyspace string, tp tablePlan, ring scyllaclient.Ring) *tableGenerator {
171191
todoRanges := make(map[scyllaclient.TokenRange]struct{})
172192
for _, rt := range ring.ReplicaTokens {
@@ -187,7 +207,7 @@ func (g *generator) newTableGenerator(keyspace string, tp tablePlan, ring scylla
187207

188208
var jt jobType
189209
switch {
190-
case tabletKs && tp.FullRepair && g.plan.apiSupport.tabletRepairNoHostFiltering:
210+
case tabletKs && g.plan.apiSupport.tabletRepair:
191211
jt = tabletJobType
192212
case g.plan.apiSupport.smallTableRepair && tp.Small && !tabletKs:
193213
jt = smallTableJobType

pkg/service/repair/helper_integration_test.go

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ import (
2424

2525
"github.com/scylladb/go-log"
2626
"github.com/scylladb/gocqlx/v2"
27-
"github.com/scylladb/scylla-manager/v3/pkg/dht"
2827
"github.com/scylladb/scylla-manager/v3/pkg/scyllaclient"
2928
. "github.com/scylladb/scylla-manager/v3/pkg/testutils"
3029
. "github.com/scylladb/scylla-manager/v3/pkg/testutils/db"
@@ -39,6 +38,16 @@ import (
3938
// like Scylla version or tablets.
4039
var globalNodeInfo *scyllaclient.NodeInfo
4140

41+
func tabletRepairSupport(t *testing.T) bool {
42+
t.Helper()
43+
44+
ok, err := globalNodeInfo.SupportsTabletRepair()
45+
if err != nil {
46+
t.Fatal(err)
47+
}
48+
return ok
49+
}
50+
4251
// Used to fill globalNodeInfo before running the tests.
4352
func TestMain(m *testing.M) {
4453
if !flag.Parsed() {
@@ -96,15 +105,18 @@ func dropKeyspace(t *testing.T, session gocqlx.Session, keyspace string) {
96105

97106
type repairReq struct {
98107
// always set
99-
host string
100-
keyspace string
101-
table string
108+
host string
109+
keyspace string
110+
table string
111+
// always set for vnode
102112
replicaSet []string
103113
ranges []scyllaclient.TokenRange
104-
105-
// optional
106-
SmallTableOptimization bool
107-
RangesParallelism int
114+
// optional for vnode
115+
smallTableOptimization bool
116+
rangesParallelism int
117+
// optional for tablet
118+
dcFilter []string
119+
hostFilter []string
108120
}
109121

110122
func (r repairReq) fullTable() string {
@@ -198,15 +210,15 @@ func parseRepairAsyncReq(t *testing.T, req *http.Request) repairReq {
198210
table: req.URL.Query().Get("columnFamilies"),
199211
replicaSet: strings.Split(req.URL.Query().Get("hosts"), ","),
200212
ranges: parseRanges(t, req.URL.Query().Get("ranges")),
201-
SmallTableOptimization: req.URL.Query().Get("small_table_optimization") == "true",
213+
smallTableOptimization: req.URL.Query().Get("small_table_optimization") == "true",
202214
}
203215
if rawRangesParallelism := req.URL.Query().Get("ranges_parallelism"); rawRangesParallelism != "" {
204216
rangesParallelism, err := strconv.Atoi(rawRangesParallelism)
205217
if err != nil {
206218
t.Error(err)
207219
return repairReq{}
208220
}
209-
sched.RangesParallelism = rangesParallelism
221+
sched.rangesParallelism = rangesParallelism
210222
}
211223
if sched.keyspace == "" || sched.table == "" || len(sched.replicaSet) == 0 {
212224
t.Error("Not fully initialized old repair sched req")
@@ -226,13 +238,8 @@ func parseTabletRepairReq(t *testing.T, req *http.Request) repairReq {
226238
host: req.Host,
227239
keyspace: req.URL.Query().Get("ks"),
228240
table: req.URL.Query().Get("table"),
229-
replicaSet: ManagedClusterHosts(),
230-
ranges: []scyllaclient.TokenRange{
231-
{
232-
StartToken: dht.Murmur3MinToken,
233-
EndToken: dht.Murmur3MaxToken,
234-
},
235-
},
241+
dcFilter: strings.Split(req.URL.Query().Get("dcs_filter"), ","),
242+
hostFilter: strings.Split(req.URL.Query().Get("hosts_filter"), ","),
236243
}
237244
if sched.keyspace == "" || sched.table == "" {
238245
t.Error("Not fully initialized tablet repair sched req")

pkg/service/repair/model.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ func NewIntensityFromDeprecated(i float64) Intensity {
3131

3232
// Target specifies what shall be repaired.
3333
type Target struct {
34-
Units []Unit `json:"units"`
35-
DC []string `json:"dc"`
36-
Host string `json:"host,omitempty"`
34+
Units []Unit `json:"units"`
35+
DC []string `json:"dc"`
36+
Host string `json:"host,omitempty"`
37+
// Down hosts excluded from repair by the --ignore-down-hosts flag.
3738
IgnoreHosts []string `json:"ignore_hosts,omitempty"`
3839
FailFast bool `json:"fail_fast"`
3940
Continue bool `json:"continue"`

pkg/service/repair/plan.go

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ type keyspacePlans []keyspacePlan
3333
type keyspacePlan struct {
3434
Keyspace string
3535
Size int64
36+
Tablet bool
3637
Tables []tablePlan
3738
}
3839

@@ -43,7 +44,6 @@ type tablePlan struct {
4344
RangesCnt int
4445
ReplicaSetCnt int
4546
Small bool
46-
FullRepair bool // Is an entire table is going to be repaired
4747
}
4848

4949
type tableStats struct {
@@ -83,12 +83,8 @@ func newPlan(ctx context.Context, target Target, client *scyllaclient.Client) (*
8383
// Update ranges and hosts
8484
rangesCnt := 0
8585
replicaSetCnt := 0
86-
fullRepair := true
8786
for _, rep := range ring.ReplicaTokens {
8887
filtered := filterReplicaSet(rep.ReplicaSet, ring.HostDC, target)
89-
if len(filtered) != len(rep.ReplicaSet) {
90-
fullRepair = false
91-
}
9288
if len(filtered) == 0 {
9389
continue
9490
}
@@ -106,13 +102,13 @@ func newPlan(ctx context.Context, target Target, client *scyllaclient.Client) (*
106102
Table: t,
107103
ReplicaSetCnt: replicaSetCnt,
108104
RangesCnt: rangesCnt,
109-
FullRepair: fullRepair,
110105
})
111106
}
112107

113108
if len(tables) > 0 {
114109
ks = append(ks, keyspacePlan{
115110
Keyspace: u.Keyspace,
111+
Tablet: ringDescriber.IsTabletKeyspace(u.Keyspace),
116112
Tables: tables,
117113
})
118114
}
@@ -381,8 +377,7 @@ type apiSupport struct {
381377
// 'small_table_optimization' query param.
382378
smallTableRepair bool
383379
// If /storage_service/tablets/repair API is exposed.
384-
// It might not necessarily allow for host filtering.
385-
tabletRepairNoHostFiltering bool
380+
tabletRepair bool
386381
}
387382

388383
func getRepairAPISupport(ctx context.Context, client *scyllaclient.Client, hosts []string) (apiSupport, error) {
@@ -408,7 +403,7 @@ func getRepairAPISupport(ctx context.Context, client *scyllaclient.Client, hosts
408403
smallTableOpt.Store(false)
409404
}
410405

411-
res, err = ni.SupportsTabletRepairNoHostFiltering()
406+
res, err = ni.SupportsTabletRepair()
412407
if err != nil {
413408
return err
414409
}
@@ -424,7 +419,7 @@ func getRepairAPISupport(ctx context.Context, client *scyllaclient.Client, hosts
424419
return apiSupport{}, err
425420
}
426421
return apiSupport{
427-
smallTableRepair: smallTableOpt.Load(),
428-
tabletRepairNoHostFiltering: fullTabletTableOpt.Load(),
422+
smallTableRepair: smallTableOpt.Load(),
423+
tabletRepair: fullTabletTableOpt.Load(),
429424
}, nil
430425
}

0 commit comments

Comments
 (0)