Skip to content

Commit 81143d6

Browse files
committed
Introduce Checkable#scheduler_shuffle_cap
1 parent 3276cc9 commit 81143d6

File tree

5 files changed

+30
-5
lines changed

5 files changed

+30
-5
lines changed

doc/09-object-types.md

+2
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ Configuration Attributes:
352352
check\_timeout | Duration | **Optional.** Check command timeout in seconds. Overrides the CheckCommand's `timeout` attribute.
353353
check\_interval | Duration | **Optional.** The check interval (in seconds). This interval is used for checks when the host is in a `HARD` state. Defaults to `5m`.
354354
retry\_interval | Duration | **Optional.** The retry interval (in seconds). This interval is used for checks when the host is in a `SOFT` state. Defaults to `1m`. Note: This does not affect the scheduling [after a passive check result](08-advanced-topics.md#check-result-freshness).
355+
scheduler\_shuffle\_cap | Number | **Optional.** Maximum percentage by which Icinga is allowed to randomly lengthen or shorten the check interval in order to reduce load spikes. Defaults to 0.
355356
enable\_notifications | Boolean | **Optional.** Whether notifications are enabled. Defaults to true.
356357
enable\_active\_checks | Boolean | **Optional.** Whether active checks are enabled. Defaults to true.
357358
enable\_passive\_checks | Boolean | **Optional.** Whether passive checks are enabled. Defaults to true.
@@ -719,6 +720,7 @@ Configuration Attributes:
719720
check\_timeout | Duration | **Optional.** Check command timeout in seconds. Overrides the CheckCommand's `timeout` attribute.
720721
check\_interval | Duration | **Optional.** The check interval (in seconds). This interval is used for checks when the service is in a `HARD` state. Defaults to `5m`.
721722
retry\_interval | Duration | **Optional.** The retry interval (in seconds). This interval is used for checks when the service is in a `SOFT` state. Defaults to `1m`. Note: This does not affect the scheduling [after a passive check result](08-advanced-topics.md#check-result-freshness).
723+
scheduler\_shuffle\_cap | Number | **Optional.** Maximum percentage by which Icinga is allowed to randomly lengthen or shorten the check interval in order to reduce load spikes. Defaults to 0.
722724
enable\_notifications | Boolean | **Optional.** Whether notifications are enabled. Defaults to `true`.
723725
enable\_active\_checks | Boolean | **Optional.** Whether active checks are enabled. Defaults to `true`.
724726
enable\_passive\_checks | Boolean | **Optional.** Whether passive checks are enabled. Defaults to `true`.

lib/icinga/checkable-check.cpp

+21-3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "base/convert.hpp"
1515
#include "base/utility.hpp"
1616
#include "base/context.hpp"
17+
#include <cstdlib>
1718

1819
using namespace icinga;
1920

@@ -67,7 +68,7 @@ void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
6768
if (adj != 0.0)
6869
adj = std::min(0.5 + fmod(GetSchedulingOffset(), interval * 5) / 100.0, adj);
6970

70-
double nextCheck = now - adj + interval;
71+
double nextCheck = now - adj + interval * GetIntervalShuffleFactor();
7172
double lastCheck = GetLastCheck();
7273

7374
Log(LogDebug, "Checkable")
@@ -372,7 +373,7 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
372373
if (ttl > 0)
373374
offset = ttl;
374375
else
375-
offset = GetCheckInterval();
376+
offset = GetCheckInterval() * GetIntervalShuffleFactor();
376377

377378
SetNextCheck(Utility::GetTime() + offset, false, origin);
378379
}
@@ -412,7 +413,7 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
412413
if (!parent->GetEnableActiveChecks())
413414
continue;
414415

415-
if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
416+
if (parent->GetNextCheck() >= now + parent->GetRetryInterval() * parent->GetIntervalShuffleFactor()) {
416417
ObjectLock olock(parent);
417418
parent->SetNextCheck(now);
418419
}
@@ -707,3 +708,20 @@ void Checkable::AquirePendingCheckSlot(int maxPendingChecks)
707708

708709
m_PendingChecks++;
709710
}
711+
712+
/**
713+
* Returns a random factor, derived from scheduler_shuffle_cap, to multiply the check interval with.
714+
*
715+
* E.g. if scheduler_shuffle_cap is 20 (%), this function returns a value within [0.8, 1.2].
* If active checks are disabled for this checkable, the factor is always 1.
*
* NOTE(review): the cap is not clamped here, so a value above 100 could yield a
* negative factor -- verify that the attribute is validated elsewhere.
*
* @returns 1 + (scheduler_shuffle_cap / 100) * r, where r is a random number within [-1, 1].
716+
*/
717+
double Checkable::GetIntervalShuffleFactor()
718+
{
719+
if (!GetEnableActiveChecks()) {
720+
// scheduler_shuffle_cap doesn't influence external checkers, so leave the interval untouched.
721+
return 1;
722+
}
723+
724+
return (GetSchedulerShuffleCap() / 100) // scheduler_shuffle_cap as a fraction instead of percent, e.g. 10 => 0.1
725+
* (rand() / (double)RAND_MAX * 2 - 1) // rand() scaled from [0, RAND_MAX] to a random number within [-1, 1]
726+
+ 1; // shift so that the result is centered around 1 (= unmodified interval)
727+
}

lib/icinga/checkable.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "base/exception.hpp"
1010
#include "base/timer.hpp"
1111
#include <boost/thread/once.hpp>
12+
#include <cmath>
1213

1314
using namespace icinga;
1415

@@ -91,9 +92,9 @@ void Checkable::Start(bool runtimeCreated)
9192
}
9293

9394
if (GetNextCheck() < now + 60) {
94-
double delta = std::min(GetCheckInterval(), 60.0);
95+
double delta = std::min(GetCheckInterval() * GetIntervalShuffleFactor(), 60.0);
9596
delta *= (double)std::rand() / RAND_MAX;
96-
SetNextCheck(now + delta);
97+
SetNextCheck(now + delta + GetCheckInterval() * fabs(GetIntervalShuffleFactor() - 1));
9798
}
9899

99100
ObjectImpl<Checkable>::Start(runtimeCreated);

lib/icinga/checkable.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ class Checkable : public ObjectImpl<Checkable>
198198
bool NotificationReasonApplies(NotificationType type);
199199
bool NotificationReasonSuppressed(NotificationType type);
200200
bool IsLikelyToBeCheckedSoon();
201+
double GetIntervalShuffleFactor();
201202

202203
void FireSuppressedNotifications();
203204

lib/icinga/checkable.ti

+3
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ abstract class Checkable : CustomVarObject
4747
[config] double retry_interval {
4848
default {{{ return 60; }}}
4949
};
50+
[config] double scheduler_shuffle_cap {
51+
default {{{ return 0.0; }}}
52+
};
5053
[config, navigation] name(EventCommand) event_command (EventCommandRaw) {
5154
navigate {{{
5255
return EventCommand::GetByName(GetEventCommandRaw());

0 commit comments

Comments
 (0)