Skip to content

Commit ccc1cc4

Browse files
Ahmet OeztuerkAhmet Oeztuerk
authored andcommitted
stale detection uses retry_interval when needed
revise the stale detection warning: - it is now visible even if there aren't any dependencies. - displays the last checked time - displays additional hints about core and gearman
1 parent 0aab695 commit ccc1cc4

2 files changed

Lines changed: 46 additions & 6 deletions

File tree

lib/Thruk/Controller/extinfo.pm

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,17 +1050,48 @@ sub _check_stale_check {
10501050

10511051
return(0) unless $obj->{'in_check_period'};
10521052

1053-
# do dependencies exist
1054-
return(0) unless(scalar @{$obj->{'depends_exec'}||[]} > 0 || scalar @{$obj->{'parents'}||[]} > 0);
1053+
# staleness requirement:
1054+
# from the last check time, a next check is scheduled
1055+
# from that next check, a hypothetical second next check is scheduled as well
1056+
# if the second next check lies in the past, the service is marked stale, as it missed two planned checks
10551057

10561058
my $peer_key = $obj->{'peer_key'};
10571059
my $check_interval = $obj->{'check_interval'} * $c->stash->{'pi_detail'}->{$peer_key}->{'interval_length'};
1060+
my $retry_interval = $obj->{'retry_interval'} * $c->stash->{'pi_detail'}->{$peer_key}->{'interval_length'};
1061+
my $max_check_attempts = $obj->{'max_check_attempts'};
1062+
my $current_attempt = $obj->{'current_attempt'};
1063+
my $state = $obj->{'state'};
1064+
my $last_check = $obj->{'last_check'}; # Last time the check got an answer
1065+
# obj.next_check is refreshed, even when there haven't been any responses for a while.
1066+
# Staleness detection is based on last_check, next_check does not help
1067+
1068+
my $next_planned_check = 0;
1069+
if ($state == 0) {
1070+
$next_planned_check = $last_check + $check_interval;
1071+
}
1072+
elsif ($state != 0 && $current_attempt != $max_check_attempts) {
1073+
$next_planned_check = $last_check + $check_interval;
1074+
}
1075+
else{
1076+
$next_planned_check = $last_check + $retry_interval;
1077+
}
1078+
my $second_next_planned_check = 0;
1079+
if ($state == 0) {
1080+
$second_next_planned_check = $next_planned_check + $check_interval;
1081+
}
1082+
elsif ($state != 0 && $current_attempt != $max_check_attempts) {
1083+
$second_next_planned_check = $next_planned_check + $check_interval;
1084+
}
1085+
else{
1086+
$second_next_planned_check = $next_planned_check + $retry_interval;
10581087

10591088
# wait at least twice of the normal check interval
10601089
if($obj->{'last_check'} > time() - $check_interval * 2) {
10611090
return(0);
10621091
}
10631092

1093+
return(0) if $second_next_planned_check > time();
1094+
10641095
# did any of the parents fail?
10651096
my $worst = 0;
10661097
for my $parent (values %{$nodes}) {

templates/_extinfo_host_service_details.tt

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,19 @@ END
111111
[% IF stale_hint %]
112112
<div class="card alert w-auto red relative shadow-none flexcol gap-1 flex-nowrap justify-center p-2 m-2">
113113
<h3>Stale [% type | html %] detected</h3>
114-
<div>
115-
This [% type | html %] has not been checked recently. Have a look at the
116-
<a href="[% uri_with(c, { type => 'dtree' }) %]" class="link font-bold">dependency tree <i class="uil uil-share-alt align-middle"></i></a>
117-
to get a hint.
114+
<div class="whitespace-pre-line">
115+
<p class="mb-2">
116+
This [% type | html %] has likely missed two scheduled checks in a row according to its last check time: [% date.format(obj.last_check) %]
117+
</p>
118+
<p class="mb-2">
119+
This could be a dependency issue, you can check the <a href="[% uri_with(c, { type => 'dtree' }) %]" class="link font-bold">dependency tree</a> to confirm this.
120+
</p>
121+
<p>
122+
If rescheduling a manual check works and last update is fresh, it is likely that the core is scheduling checks properly but not getting results.
123+
</p>
124+
<p>
125+
If Gearman is used, it could be that Gearman workers are down or overloaded beyond the point where they can follow the schedule.
126+
</p>
118127
</div>
119128
</div>
120129
[% END %]

0 commit comments

Comments
 (0)