Skip to content

Commit ea52376

Browse files
authored
runaway: fix the issue where COOLDOWN/SWITCH_GROUP can't be triggered (pingcap#60457)
close pingcap#60404
1 parent c9b33d6 commit ea52376

File tree

2 files changed

+43
-33
lines changed

2 files changed

+43
-33
lines changed

pkg/executor/adapter.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -562,12 +562,7 @@ func (a *ExecStmt) Exec(ctx context.Context) (_ sqlexec.RecordSet, err error) {
562562
return nil, err
563563
}
564564
if len(switchGroupName) > 0 {
565-
group, err := rm.ResourceGroupCtl.GetResourceGroup(switchGroupName)
566-
if err != nil || group == nil {
567-
logutil.BgLogger().Debug("invalid switch resource group", zap.String("switch-group-name", switchGroupName), zap.Error(err))
568-
} else {
569-
stmtCtx.ResourceGroupName = switchGroupName
570-
}
565+
stmtCtx.ResourceGroupName = switchGroupName
571566
}
572567
}
573568
ctx = a.observeStmtBeginForTopSQL(ctx)

pkg/resourcegroup/runaway/checker.go

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@ func (rm *Manager) DeriveChecker(resourceGroupName, originalSQL, sqlDigest, plan
112112
return NewChecker(rm, resourceGroupName, group.RunawaySettings, originalSQL, sqlDigest, planDigest, startTime)
113113
}
114114

115+
func (r *Checker) isMarkedByIdentifyInRunawaySettings() bool {
116+
if r == nil {
117+
return false
118+
}
119+
return r.markedByIdentifyInRunawaySettings.Load()
120+
}
121+
115122
// BeforeExecutor checks whether query is in watch list before executing and after compiling.
116123
func (r *Checker) BeforeExecutor() (string, error) {
117124
if r == nil {
@@ -149,34 +156,43 @@ func (r *Checker) BeforeExecutor() (string, error) {
149156
return "", nil
150157
case rmpb.RunawayAction_SwitchGroup:
151158
// Return the switch group name to switch the resource group before executing.
152-
return switchGroupName, nil
159+
return r.checkSwitchGroupName(switchGroupName), nil
153160
default:
154161
// Continue to examine other convicts.
155162
}
156163
}
157164
return "", nil
158165
}
159166

167+
func (r *Checker) checkSwitchGroupName(groupName string) string {
168+
if len(groupName) == 0 {
169+
return ""
170+
}
171+
group, err := r.manager.ResourceGroupCtl.GetResourceGroup(groupName)
172+
if err != nil || group == nil {
173+
logutil.BgLogger().Debug("invalid switch resource group", zap.String("switch-group-name", groupName), zap.Error(err))
174+
return ""
175+
}
176+
return groupName
177+
}
178+
160179
// BeforeCopRequest checks runaway and modifies the request if necessary before sending coprocessor request.
161180
func (r *Checker) BeforeCopRequest(req *tikvrpc.Request) error {
162181
if r == nil {
163182
return nil
164183
}
165-
// If the group settings are not available, and it's not marked by watch, skip this part.
166-
if r.settings == nil && !r.markedByQueryWatchRule {
167-
return nil
168-
}
169-
// If it's marked by watch and the action is cooldown, override the priority,
184+
// If it's marked by watch and the action is cooldown, override the priority.
185+
// Other watch actions should already been handled in `BeforeExecutor` before.
170186
if r.markedByQueryWatchRule && r.watchAction == rmpb.RunawayAction_CoolDown {
171187
req.ResourceControlContext.OverridePriority = 1 // set priority to lowest
172188
}
173-
// If group settings are available and the query is not marked by a rule,
174-
// verify if it matches any rules in the settings.
175-
if r.settings != nil && !r.markedByIdentifyInRunawaySettings.Load() {
189+
// If group settings are available, verify if it matches any rules in the settings.
190+
if r.settings != nil {
176191
now := time.Now()
177-
// only check time and need to ensure deadline existed.
192+
// Check and ensure the deadline exists.
178193
exceedCause := r.exceedsThresholds(now, nil, 0)
179-
if exceedCause == "" { // only set timeout when the query is not runaway.
194+
// Only set timeout when the query has not been marked as runaway yet.
195+
if !r.isMarkedByIdentifyInRunawaySettings() && len(exceedCause) == 0 {
180196
if r.settings.Action == rmpb.RunawayAction_Kill {
181197
until := r.deadline.Sub(now)
182198
// if the execution time is close to the threshold, set a timeout
@@ -186,7 +202,8 @@ func (r *Checker) BeforeCopRequest(req *tikvrpc.Request) error {
186202
}
187203
return nil
188204
}
189-
// execution time exceeds the threshold, mark the query as runaway
205+
// Try to mark the query as runaway, it's safe to call this method concurrently.
206+
// So it's possible that the query has already been marked as runaway in `CheckThresholds`.
190207
r.markRunawayByIdentifyInRunawaySettings(&now, exceedCause)
191208
// Take action if needed.
192209
switch r.settings.Action {
@@ -196,7 +213,9 @@ func (r *Checker) BeforeCopRequest(req *tikvrpc.Request) error {
196213
req.ResourceControlContext.OverridePriority = 1 // set priority to lowest
197214
return nil
198215
case rmpb.RunawayAction_SwitchGroup:
199-
req.ResourceControlContext.ResourceGroupName = r.settings.SwitchGroupName
216+
if switchGroupName := r.checkSwitchGroupName(r.settings.SwitchGroupName); len(switchGroupName) != 0 {
217+
req.ResourceControlContext.ResourceGroupName = switchGroupName
218+
}
200219
return nil
201220
default:
202221
return nil
@@ -214,7 +233,7 @@ func (r *Checker) CheckAction() rmpb.RunawayAction {
214233
if r.markedByQueryWatchRule {
215234
return r.watchAction
216235
}
217-
if r.markedByIdentifyInRunawaySettings.Load() {
236+
if r.isMarkedByIdentifyInRunawaySettings() {
218237
return r.settings.Action
219238
}
220239
return rmpb.RunawayAction_NoneAction
@@ -227,7 +246,7 @@ func (r *Checker) CheckRuleKillAction() (string, bool) {
227246
return "", false
228247
}
229248
// If the group settings are available, and it's not marked by rule, check the execution time.
230-
if r.settings != nil && !r.markedByIdentifyInRunawaySettings.Load() {
249+
if r.settings != nil && !r.isMarkedByIdentifyInRunawaySettings() {
231250
now := time.Now()
232251
exceedCause := r.exceedsThresholds(now, nil, 0)
233252
if exceedCause == "" {
@@ -249,15 +268,14 @@ func (r *Checker) markQuarantine(now *time.Time, exceedCause string) {
249268
r.settings.Action, r.settings.SwitchGroupName, ttl, now, exceedCause)
250269
}
251270

252-
func (r *Checker) markRunawayByIdentifyInRunawaySettings(now *time.Time, exceedCause string) bool {
271+
func (r *Checker) markRunawayByIdentifyInRunawaySettings(now *time.Time, exceedCause string) {
253272
swapped := r.markedByIdentifyInRunawaySettings.CompareAndSwap(false, true)
254273
if swapped {
255274
r.markRunaway("identify", r.settings.Action, r.settings.SwitchGroupName, now, exceedCause)
256275
if !r.markedByQueryWatchRule {
257276
r.markQuarantine(now, exceedCause)
258277
}
259278
}
260-
return swapped
261279
}
262280

263281
func (r *Checker) markRunawayByQueryWatchRule(action rmpb.RunawayAction, switchGroupName, exceedCause string) {
@@ -320,7 +338,7 @@ func (r *Checker) CheckThresholds(ruDetail *util.RUDetails, processKeys int64, e
320338
processKeys = int64(100)
321339
}
322340
})
323-
if r.settings == nil || r.settings.Action != rmpb.RunawayAction_Kill {
341+
if r.settings == nil {
324342
return err
325343
}
326344

@@ -333,18 +351,15 @@ func (r *Checker) CheckThresholds(ruDetail *util.RUDetails, processKeys int64, e
333351
atomic.AddInt64(&r.totalProcessedKeys, processKeys)
334352
totalProcessedKeys := atomic.LoadInt64(&r.totalProcessedKeys)
335353
exceedCause := r.exceedsThresholds(checkTime, ruDetail, totalProcessedKeys)
336-
if !r.markedByIdentifyInRunawaySettings.Load() {
337-
if exceedCause != "" && r.markRunawayByIdentifyInRunawaySettings(&now, exceedCause) {
338-
if r.markRunawayByIdentifyInRunawaySettings(&now, exceedCause) {
339-
return exeerrors.ErrResourceGroupQueryRunawayInterrupted.FastGenByArgs(exceedCause)
340-
}
341-
}
354+
// No need to mark as runaway if the query is not exceeded any threshold.
355+
if len(exceedCause) == 0 {
356+
return err
342357
}
343-
// Due to concurrency, check again.
344-
if r.markedByIdentifyInRunawaySettings.Load() {
358+
r.markRunawayByIdentifyInRunawaySettings(&now, exceedCause)
359+
// Other actions will be handled in `BeforeCopRequest` since they need to modify the request.
360+
if r.settings.Action == rmpb.RunawayAction_Kill {
345361
return exeerrors.ErrResourceGroupQueryRunawayInterrupted.FastGenByArgs(exceedCause)
346362
}
347-
348363
return err
349364
}
350365

0 commit comments

Comments
 (0)