Skip to content

Commit 44d2148

Browse files
authored
dashboard: show quorum status, age, old replicas (#10)
1 parent ba892f2 commit 44d2148

File tree

5 files changed

+69
-26
lines changed

5 files changed

+69
-26
lines changed

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ axum = "0.7.7"
1010
gethostname = "0.5.0"
1111
log = "0.4.22"
1212
prost = "0.13.3"
13+
prost-types = "0.13.3"
1314
pyo3 = {version="0.22.3", features = ["extension-module"]}
1415
slog = "2.7.0"
1516
slog-stdlog = "4.1.1"

proto/torchft.proto

+3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
syntax = "proto3";
88
package torchft;
99

10+
import "google/protobuf/timestamp.proto";
11+
1012
message RaftMessageRequest {
1113
// Request message contains the serialized Raft proto message.
1214
bytes message = 1;
@@ -43,6 +45,7 @@ message QuorumMember {
4345
message Quorum {
4446
int64 quorum_id = 1;
4547
repeated QuorumMember participants = 2;
48+
google.protobuf.Timestamp created = 3;
4649
}
4750

4851
message LighthouseQuorumRequest {

src/lighthouse.rs

+51-22
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use core::net::SocketAddr;
88
use std::collections::HashMap;
99
use std::sync::Arc;
1010
use std::time::Duration;
11-
use std::time::Instant;
11+
use std::time::{Instant, SystemTime};
1212

1313
use anyhow::{anyhow, Result};
1414
use askama::Template;
@@ -97,7 +97,8 @@ impl Lighthouse {
9797
})
9898
}
9999

100-
async fn quorum_valid(&self) -> bool {
100+
// Checks whether the quorum is valid and an explanation for the state.
101+
async fn quorum_valid(&self) -> (bool, String) {
101102
let state = self.state.lock().await;
102103

103104
let mut first_joined = Instant::now();
@@ -119,38 +120,43 @@ impl Lighthouse {
119120
}
120121

121122
if is_fast_quorum {
122-
info!("Fast quorum found!");
123-
return is_fast_quorum;
123+
return (is_fast_quorum, format!("Fast quorum found!"));
124124
}
125125
}
126126

127127
if state.participants.len() < self.opt.min_replicas as usize {
128-
info!(
129-
"No quorum, only have {} participants, need {}",
130-
state.participants.len(),
131-
self.opt.min_replicas
128+
return (
129+
false,
130+
format!(
131+
"No quorum, only have {} participants, need {}",
132+
state.participants.len(),
133+
self.opt.min_replicas
134+
),
132135
);
133-
return false;
134136
}
135137

136138
// Quorum is valid at this point but lets wait for stragglers.
137139

138140
if Instant::now().duration_since(first_joined)
139141
< Duration::from_millis(self.opt.join_timeout_ms)
140142
{
141-
info!(
142-
"Valid quorum with {} participants, waiting for stragglers due to join timeout",
143-
state.participants.len()
143+
return (
144+
false,
145+
format!(
146+
"Valid quorum with {} participants, waiting for stragglers due to join timeout",
147+
state.participants.len()
148+
),
144149
);
145-
return false;
146150
}
147151

148-
true
152+
(true, format!("Valid quorum found"))
149153
}
150154

151155
async fn _quorum_tick(self: Arc<Self>) -> Result<()> {
152156
// TODO: these should probably run under the same lock
153-
let quorum_met = self.quorum_valid().await;
157+
let (quorum_met, reason) = self.quorum_valid().await;
158+
info!("{}", reason);
159+
154160
if quorum_met {
155161
let mut state = self.state.lock().await;
156162
let mut participants: Vec<QuorumMember> = state
@@ -180,6 +186,7 @@ impl Lighthouse {
180186
let quorum = Quorum {
181187
quorum_id: state.quorum_id,
182188
participants: participants,
189+
created: Some(SystemTime::now().into()),
183190
};
184191

185192
info!("Quorum! {:?}", quorum);
@@ -257,16 +264,33 @@ impl Lighthouse {
257264
}
258265

259266
async fn get_status(self: Arc<Self>) -> Html<String> {
267+
let (_, quorum_status) = self.quorum_valid().await;
268+
260269
let template = {
261270
let state = self.state.lock().await;
262271

272+
let max_step = {
273+
if let Some(quorum) = state.prev_quorum.clone() {
274+
quorum
275+
.participants
276+
.iter()
277+
.map(|p| p.step)
278+
.max()
279+
.unwrap_or(-1)
280+
} else {
281+
-1
282+
}
283+
};
284+
263285
StatusTemplate {
264286
quorum_id: state.quorum_id,
265287
prev_quorum: state.prev_quorum.clone(),
266288
heartbeats: state.heartbeats.clone(),
289+
quorum_status: quorum_status,
267290
old_age_threshold: Instant::now()
268291
.checked_sub(Duration::from_secs(1))
269292
.unwrap_or(Instant::now()),
293+
max_step: max_step,
270294
}
271295
};
272296
Html(template.render().unwrap())
@@ -361,10 +385,14 @@ struct IndexTemplate {}
361385
#[derive(Template)]
362386
#[template(path = "status.html")]
363387
struct StatusTemplate {
364-
old_age_threshold: Instant,
365388
prev_quorum: Option<Quorum>,
366389
quorum_id: i64,
367390
heartbeats: HashMap<String, Instant>,
391+
quorum_status: String,
392+
393+
// visualization thresholds
394+
old_age_threshold: Instant,
395+
max_step: i64,
368396
}
369397

370398
// Make our own error that wraps `anyhow::Error`.
@@ -422,7 +450,7 @@ mod tests {
422450
#[tokio::test]
423451
async fn test_quorum_join_timeout() {
424452
let lighthouse = lighthouse_test_new();
425-
assert!(!lighthouse.quorum_valid().await);
453+
assert!(!lighthouse.quorum_valid().await.0);
426454

427455
{
428456
let mut state = lighthouse.state.lock().await;
@@ -440,21 +468,21 @@ mod tests {
440468
);
441469
}
442470

443-
assert!(!lighthouse.quorum_valid().await);
471+
assert!(!lighthouse.quorum_valid().await.0);
444472

445473
{
446474
let mut state = lighthouse.state.lock().await;
447475
state.participants.get_mut("a").unwrap().joined =
448476
Instant::now().sub(Duration::from_secs(10 * 60 * 60));
449477
}
450478

451-
assert!(lighthouse.quorum_valid().await);
479+
assert!(lighthouse.quorum_valid().await.0);
452480
}
453481

454482
#[tokio::test]
455483
async fn test_quorum_fast_prev_quorum() {
456484
let lighthouse = lighthouse_test_new();
457-
assert!(!lighthouse.quorum_valid().await);
485+
assert!(!lighthouse.quorum_valid().await.0);
458486

459487
{
460488
let mut state = lighthouse.state.lock().await;
@@ -472,7 +500,7 @@ mod tests {
472500
);
473501
}
474502

475-
assert!(!lighthouse.quorum_valid().await);
503+
assert!(!lighthouse.quorum_valid().await.0);
476504

477505
{
478506
let mut state = lighthouse.state.lock().await;
@@ -484,10 +512,11 @@ mod tests {
484512
store_address: "".to_string(),
485513
step: 1,
486514
}],
515+
created: Some(SystemTime::now().into()),
487516
});
488517
}
489518

490-
assert!(lighthouse.quorum_valid().await);
519+
assert!(lighthouse.quorum_valid().await.0);
491520
}
492521

493522
#[tokio::test]

templates/index.html

+3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@
4040
padding: 10px;
4141
border: 1px solid #333;
4242
}
43+
.member.recovering {
44+
background-color: orange;
45+
}
4346
.heartbeat.old {
4447
color: red;
4548
}

templates/status.html

+11-4
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
<h2>Quorum Status</h2>
22

3-
Current quorum_id: {{quorum_id}}
3+
Current quorum_id: {{quorum_id}} <br>
4+
Next quorum status: {{quorum_status}}
45

56
<h3>Previous Quorum</h3>
67
{% if let Some(prev_quorum) = prev_quorum %}
78

8-
Previous quorum id: {{prev_quorum.quorum_id}}
9+
Previous quorum id: {{prev_quorum.quorum_id}} <br>
10+
Quorum age:
11+
{{SystemTime::try_from(prev_quorum.created.unwrap()).unwrap().elapsed().unwrap().as_secs_f64()}}s
912

1013
<div>
1114
{% for member in prev_quorum.participants %}
1215

13-
<div class="member">
16+
<div class="member
17+
{% if member.step != max_step %}recovering{% endif %}
18+
">
1419
<b>{{ member.replica_id }}</b> <br/>
1520
Step: {{ member.step }} <br/>
1621
Manager: {{ member.address }} <br/>
@@ -33,7 +38,9 @@ <h3>Heartbeats</h3>
3338
{% for replica_id in heartbeats.keys() %}
3439

3540
{% let age = heartbeats[replica_id].elapsed().as_secs_f64() %}
36-
<li class="heartbeat {% if heartbeats[replica_id].lt(old_age_threshold) %}old{%endif%}">
41+
<li class="heartbeat
42+
{% if heartbeats[replica_id].lt(old_age_threshold) %}old{%endif%}
43+
">
3744
{{ replica_id }}: seen {{ age }}s ago
3845
</li>
3946

0 commit comments

Comments
 (0)