Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
6509b9b
feat(scyllaclient): make host args optional for tablet repair
Michal-Leszczynski Nov 24, 2025
0ba2db9
refactor(repair): move incremental repair mode definition to scyllacl…
Michal-Leszczynski Nov 24, 2025
2943926
refactor(repair): move colocated repair error check to scyllaclient
Michal-Leszczynski Nov 25, 2025
ac1b0e7
feat(scyllaclient): don't retry on colocated table tablet repair err
Michal-Leszczynski Nov 27, 2025
fe18fd7
refactor(repair_test): move stop/startNode to testhelper pkg
Michal-Leszczynski Nov 27, 2025
a6d187c
feat(testing/makefile): use tablets_mode_for_new_keyspaces next to en…
Michal-Leszczynski Nov 27, 2025
fdf9d4d
feat(schema): add tablet_repair_run_progress
Michal-Leszczynski Nov 25, 2025
79afeff
feat(swagger): manager, add tablet repair progress definition
Michal-Leszczynski Nov 25, 2025
c9bb97e
feat(tablet_repair): implement tablet repair svc
Michal-Leszczynski Nov 25, 2025
38b8986
feat(tablet_repair): plug tablet repair into scheduler and api
Michal-Leszczynski Nov 25, 2025
bba0159
feat(.github): run tablet repair tests in CI
Michal-Leszczynski Nov 27, 2025
ad81961
feat(tablet_repair): handle scylla tablet repair task aborts
Michal-Leszczynski Dec 2, 2025
0df585f
s: swagger: add object field in SM swagger
Michal-Leszczynski Dec 3, 2025
3a3da45
s: abort: log with provided ctx during abort
Michal-Leszczynski Dec 3, 2025
bc9d8e1
s: implement svc: bump copyright year
Michal-Leszczynski Dec 3, 2025
61bcd5b
s: implement svc: add "task" label to metrics
Michal-Leszczynski Dec 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/cfg/integration-test-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ jobs:
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair

- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet

small-pkg:
name: Test other, smaller packages
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/integration-tests-2024.1.21-IPV4.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/integration-tests-2025.1.9-IPV4.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
tablets: ${{ env.tablets }}
- name: Run tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair
- name: Run tablet repair tests
run: make pkg-integration-test IP_FAMILY=${{ env.ip-family }} SSL_ENABLED=${{ env.ssl-enabled}} PKG=./pkg/service/repair/tablet
restore-schema:
name: Test restore schema
runs-on: ubuntu-latest
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/scylla-manager/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func (s *server) makeServices(ctx context.Context) error {
s.session,
s.config.Repair,
metrics.NewRepairMetrics().MustRegister(),
metrics.NewTabletRepairMetrics().MustRegister(),
s.clusterSvc.Client,
s.clusterSvc.GetSession,
s.configCacheSvc,
Expand Down Expand Up @@ -177,6 +178,8 @@ func (s *server) makeServices(ctx context.Context) error {
s.schedSvc.SetRunner(scheduler.HealthCheckTask, s.healthSvc.Runner())
s.schedSvc.SetRunner(scheduler.RepairTask,
scheduler.PolicyRunner{Policy: restoreExclusiveLock, Runner: s.repairSvc.Runner(), TaskType: scheduler.RepairTask})
s.schedSvc.SetRunner(scheduler.TabletRepairTask,
scheduler.PolicyRunner{Policy: restoreExclusiveLock, Runner: s.repairSvc.TabletService, TaskType: scheduler.TabletRepairTask})
s.schedSvc.SetRunner(scheduler.ValidateBackupTask, s.backupSvc.ValidationRunner())

// Add additional properties on task run.
Expand Down
52 changes: 52 additions & 0 deletions pkg/metrics/tabletrepair.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright (C) 2025 ScyllaDB

package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/scylladb/scylla-manager/v3/pkg/util/uuid"
)

// TabletRepairMetrics describes metrics of tablet repair task.
// It currently groups a single gauge vector reporting per-table progress.
type TabletRepairMetrics struct {
	// tableProgress is the "progress" gauge, set per
	// (cluster, task, keyspace, table) label combination.
	tableProgress *prometheus.GaugeVec
}

// NewTabletRepairMetrics returns a TabletRepairMetrics with all of its gauges
// constructed through gaugeVecCreator("tablet_repair"). The returned metrics
// are not registered yet; call MustRegister to do that.
func NewTabletRepairMetrics() TabletRepairMetrics {
	newGauge := gaugeVecCreator("tablet_repair")

	progress := newGauge(
		"Tablet repair progress in percents (0-100).",
		"progress",
		"cluster", "task", "keyspace", "table",
	)

	return TabletRepairMetrics{tableProgress: progress}
}

// all lists every collector owned by m, so that registration and
// reset logic can iterate over them uniformly.
func (m TabletRepairMetrics) all() []prometheus.Collector {
	collectors := []prometheus.Collector{m.tableProgress}
	return collectors
}

// MustRegister registers every collector of m via prometheus.MustRegister,
// making the metrics visible to the prometheus client.
// It returns m unchanged so the call can be chained after the constructor.
func (m TabletRepairMetrics) MustRegister() TabletRepairMetrics {
	collectors := m.all()
	prometheus.MustRegister(collectors...)
	return m
}

// ResetClusterMetrics sets every metric labeled with the given cluster
// to unspecifiedValue.
func (m TabletRepairMetrics) ResetClusterMetrics(clusterID uuid.UUID) {
	for _, collector := range m.all() {
		// Every collector returned by all() is a *prometheus.GaugeVec.
		gauge := collector.(*prometheus.GaugeVec)
		setGaugeVecMatching(gauge, unspecifiedValue, clusterMatcher(clusterID))
	}
}

// SetTableProgress sets the "progress" gauge identified by the given
// cluster, task, keyspace and table to progress.
func (m TabletRepairMetrics) SetTableProgress(clusterID, taskID uuid.UUID, keyspace, table string, progress float64) {
	m.tableProgress.With(prometheus.Labels{
		"cluster":  clusterID.String(),
		"task":     taskID.String(),
		"keyspace": keyspace,
		"table":    table,
	}).Set(progress)
}
31 changes: 31 additions & 0 deletions pkg/restapi/mock_repairservice_test.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/restapi/services.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/scylladb/scylla-manager/v3/pkg/service/healthcheck"
"github.com/scylladb/scylla-manager/v3/pkg/service/one2onerestore"
"github.com/scylladb/scylla-manager/v3/pkg/service/repair"
"github.com/scylladb/scylla-manager/v3/pkg/service/repair/tablet"
"github.com/scylladb/scylla-manager/v3/pkg/service/restore"
"github.com/scylladb/scylla-manager/v3/pkg/service/scheduler"
"github.com/scylladb/scylla-manager/v3/pkg/util/query"
Expand Down Expand Up @@ -57,6 +58,9 @@ type RepairService interface {
GetTarget(ctx context.Context, clusterID uuid.UUID, properties json.RawMessage) (repair.Target, error)
SetIntensity(ctx context.Context, runID uuid.UUID, intensity float64) error
SetParallel(ctx context.Context, runID uuid.UUID, parallel int) error

GetTabletTarget(ctx context.Context, clusterID uuid.UUID, properties json.RawMessage) (tablet.Target, error)
GetTabletProgress(ctx context.Context, clusterID, taskID, runID uuid.UUID) (tablet.Progress, error)
}

// BackupService service interface for the REST API handlers.
Expand Down
9 changes: 9 additions & 0 deletions pkg/restapi/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/scylladb/scylla-manager/v3/pkg/service/backup"
"github.com/scylladb/scylla-manager/v3/pkg/service/one2onerestore"
"github.com/scylladb/scylla-manager/v3/pkg/service/repair"
"github.com/scylladb/scylla-manager/v3/pkg/service/repair/tablet"
"github.com/scylladb/scylla-manager/v3/pkg/service/restore"
"github.com/scylladb/scylla-manager/v3/pkg/service/scheduler"
"github.com/scylladb/scylla-manager/v3/pkg/util"
Expand Down Expand Up @@ -265,6 +266,10 @@ func (h *taskHandler) validateTask(ctx context.Context, newTask *scheduler.Task,
if _, err := h.Repair.GetTarget(ctx, newTask.ClusterID, p); err != nil {
return errors.Wrap(err, "create repair target")
}
case scheduler.TabletRepairTask:
if _, err := h.Repair.GetTabletTarget(ctx, newTask.ClusterID, p); err != nil {
return errors.Wrap(err, "create tablet repair target")
}
case scheduler.ValidateBackupTask:
if _, err := h.Backup.GetValidationTarget(ctx, newTask.ClusterID, p); err != nil {
return errors.Wrap(err, "create backup validation target")
Expand Down Expand Up @@ -471,6 +476,8 @@ func (h *taskHandler) taskRunProgress(w http.ResponseWriter, r *http.Request) {
switch t.Type {
case scheduler.RepairTask:
prog.Progress = repair.Progress{}
case scheduler.TabletRepairTask:
prog.Progress = tablet.Progress{}
case scheduler.BackupTask:
prog.Progress = backup.Progress{}
case scheduler.RestoreTask:
Expand Down Expand Up @@ -507,6 +514,8 @@ func (h *taskHandler) taskRunProgress(w http.ResponseWriter, r *http.Request) {
switch t.Type {
case scheduler.RepairTask:
pr, err = h.Repair.GetProgress(r.Context(), t.ClusterID, t.ID, prog.Run.ID)
case scheduler.TabletRepairTask:
pr, err = h.Repair.GetTabletProgress(r.Context(), t.ClusterID, t.ID, prog.Run.ID)
case scheduler.BackupTask:
pr, err = h.Backup.GetProgress(r.Context(), t.ClusterID, t.ID, prog.Run.ID)
case scheduler.RestoreTask:
Expand Down
23 changes: 23 additions & 0 deletions pkg/schema/table/table.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading