Skip to content

Commit b368755

Browse files
chore(kms-connector): more granular metrics (#1476)
* chore(kms-connector): more granular metrics * docs(kms-connector): update metrics documentation
1 parent 6630c83 commit b368755

26 files changed

+541
-361
lines changed

docs/metrics/metrics.md

Lines changed: 67 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -214,121 +214,126 @@ Metrics for zkproof-worker are to be added in future releases, if/when needed. C
214214

215215
#### Metric Name: `kms_connector_gw_listener_event_received_counter`
216216
- **Type**: Counter
217+
- **Labels**:
218+
- `event_type`: can be used to filter by event type (public_decryption_request, user_decryption_request, crsgen_request, ...).
217219
- **Description**: Counts the number of events received by the GW listener.
218-
- **Alarm**: If the counter is a flat line over a period of time.
219-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
220+
- **Alarm**: If the counter is a flat line over a period of time, only for `event_type` `public_decryption_request` and `user_decryption_request`.
221+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter{event_type="..."}[1m]) == 0`.
220222

221223
#### Metric Name: `kms_connector_gw_listener_event_received_errors`
222224
- **Type**: Counter
225+
- **Labels**:
226+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
223227
- **Description**: Counts the number of errors encountered by the GW listener while receiving events.
224228
- **Alarm**: If the counter increases over a period of time.
225-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
226-
227-
#### Metric Name: `kms_connector_gw_listener_event_stored_counter`
228-
- **Type**: Counter
229-
- **Description**: Counts the number of events successfully stored in the DB by the GW listener.
230-
- **Alarm**: If the counter is a flat line over a period of time.
231-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
232-
233-
#### Metric Name: `kms_connector_gw_listener_event_storage_errors`
234-
- **Type**: Counter
235-
- **Description**: Counts the number of errors encountered by the GW listener while storing events in the DB.
236-
- **Alarm**: If the counter increases over a period of time.
237-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
229+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
238230

239231
### kms-worker
240232

241233
#### Metric Name: `kms_connector_worker_event_received_counter`
242234
- **Type**: Counter
235+
- **Labels**:
236+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
243237
- **Description**: Counts the number of events received by the KMS worker.
244-
- **Alarm**: If the counter is a flat line over a period of time.
245-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
238+
- **Alarm**: If the counter is a flat line over a period of time, only for `event_type` `public_decryption_request` and `user_decryption_request`.
239+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter{event_type="..."}[1m]) == 0`.
246240

247241
#### Metric Name: `kms_connector_worker_event_received_errors`
248242
- **Type**: Counter
243+
- **Labels**:
244+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
249245
- **Description**: Counts the number of errors encountered while listening for events in the KMS worker.
250246
- **Alarm**: If the counter increases over a period of time.
251-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
247+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
252248

253-
#### Metric Name: `kms_connector_worker_decryption_request_sent_counter`
249+
#### Metric Name: `kms_connector_worker_grpc_request_sent_counter`
254250
- **Type**: Counter
255-
- **Description**: Counts the number of decryption requests sent by the KMS worker to the KMS core.
256-
- **Alarm**: If the counter is a flat line over a period of time.
257-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
251+
- **Labels**:
252+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
253+
- **Description**: Counts the number of successful gRPC requests sent by the KMS worker to the KMS Core.
254+
- **Alarm**: If the counter is a flat line over a period of time, only for `event_type` `public_decryption_request` and `user_decryption_request`.
255+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter{event_type="..."}[1m]) == 0`.
258256

259-
#### Metric Name: `kms_connector_worker_decryption_request_sent_errors`
257+
#### Metric Name: `kms_connector_worker_grpc_request_sent_errors`
260258
- **Type**: Counter
261-
- **Description**: Counts the number of errors encountered by the KMS worker while sending decryption requests to the KMS core.
259+
- **Labels**:
260+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
261+
- **Description**: Counts the number of errors encountered by the KMS worker while sending gRPC requests to the KMS Core.
262262
- **Alarm**: If the counter increases over a period of time.
263-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
263+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
264264

265-
#### Metric Name: `kms_connector_worker_decryption_response_counter`
265+
#### Metric Name: `kms_connector_worker_grpc_response_polled_counter`
266266
- **Type**: Counter
267-
- **Description**: Counts the number of decryption responses received by the KMS worker from the KMS core.
268-
- **Alarm**: If the counter is a flat line over a period of time.
269-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
267+
- **Labels**:
268+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
269+
- **Description**: Counts the number of responses successfully polled from the KMS Core via gRPC.
270+
- **Alarm**: If the counter is a flat line over a period of time, only for `event_type` `public_decryption_request` and `user_decryption_request`.
271+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter{event_type="..."}[1m]) == 0`.
270272

271-
#### Metric Name: `kms_connector_worker_decryption_response_errors`
273+
#### Metric Name: `kms_connector_worker_grpc_response_polled_errors`
272274
- **Type**: Counter
273-
- **Description**: Counts the number of errors encountered by the KMS worker while receiving decryption responses from the KMS core.
275+
- **Labels**:
276+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter)
277+
- **Description**: Counts the number of errors encountered by the KMS worker while polling responses from the KMS Core.
274278
- **Alarm**: If the counter increases over a period of time.
275-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
276-
277-
#### Metric Name: `kms_connector_worker_key_management_request_sent_counter`
278-
- **Type**: Counter
279-
- **Description**: Counts the number of key management requests sent by the KMS worker to the KMS core.
280-
- **Alarm**: N/A - key management requests are infrequent events.
281-
282-
#### Metric Name: `kms_connector_worker_key_management_request_sent_errors`
283-
- **Type**: Counter
284-
- **Description**: Counts the number of errors encountered by the KMS worker while sending key management requests to the KMS core.
285-
- **Alarm**: If the counter increases from 0. Key management is an important event that should not fail.
286-
- **Recommendation**: alarm on any failures over a 1 minute period, i.e. `increase(counter[1m]) > 0`.
287-
288-
#### Metric Name: `kms_connector_worker_key_management_response_counter`
289-
- **Type**: Counter
290-
- **Description**: Counts the number of key management responses received by the KMS worker from the KMS core.
291-
- **Alarm**: N/A - key management responses are infrequent events.
292-
293-
#### Metric Name: `kms_connector_worker_key_management_response_errors`
294-
- **Type**: Counter
295-
- **Description**: Counts the number of errors encountered by the KMS worker while receiving key management responses from the KMS core.
296-
- **Alarm**: If the counter increases from 0. Key management is an important event that should not fail.
297-
- **Recommendation**: alarm on any failures over a 1 minute period, i.e. `increase(counter[1m]) > 0`.
279+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
298280

299281
#### Metric Name: `kms_connector_worker_s3_ciphertext_retrieval_counter`
300282
- **Type**: Counter
301283
- **Description**: Counts the number of ciphertexts retrieved by the KMS worker from S3.
302-
- **Alarm**: N/A - key management events are infrequent.
284+
- **Alarm**: If the counter is a flat line over a period of time.
285+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
303286

304287
#### Metric Name: `kms_connector_worker_s3_ciphertext_retrieval_errors`
305288
- **Type**: Counter
306289
- **Description**: Counts the number of errors encountered by the KMS worker while retrieving ciphertexts from S3.
307290
- **Alarm**: If the counter increases over a period of time.
308-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
291+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
309292

310293
### tx-sender
311294

312295
#### Metric Name: `kms_connector_tx_sender_response_received_counter`
313296
- **Type**: Counter
297+
- **Labels**:
298+
- `response_type`: can be used to filter by response type (public_decryption_response, user_decryption_response, crsgen_response, ...).
314299
- **Description**: Counts the number of responses received by the TX sender.
315-
- **Alarm**: If the counter is a flat line over a period of time.
316-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
300+
- **Alarm**: If the counter is a flat line over a period of time, only for `response_type` `public_decryption_response` and `user_decryption_response`.
301+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter{response_type="..."}[1m]) == 0`.
317302

318303
#### Metric Name: `kms_connector_tx_sender_response_received_errors`
319304
- **Type**: Counter
305+
- **Labels**:
306+
- `response_type`: see [description](#metric-name-kms_connector_tx_sender_response_received_counter)
320307
- **Description**: Counts the number of errors encountered by the TX sender while listening for responses.
321308
- **Alarm**: If the counter increases over a period of time.
322-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
309+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
323310

324311
#### Metric Name: `kms_connector_tx_sender_gateway_tx_sent_counter`
325312
- **Type**: Counter
313+
- **Labels**:
314+
- `response_type`: see [description](#metric-name-kms_connector_tx_sender_response_received_counter)
326315
- **Description**: Counts the number of transactions sent to the Gateway by the TX sender.
327-
- **Alarm**: If the counter is a flat line over a period of time.
328-
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter[1m]) == 0`.
316+
- **Alarm**: If the counter is a flat line over a period of time, only for `response_type` `public_decryption_response` and `user_decryption_response`.
317+
- **Recommendation**: 0 for more than 1 minute, i.e. `increase(counter{response_type="..."}[1m]) == 0`.
329318

330319
#### Metric Name: `kms_connector_tx_sender_gateway_tx_sent_errors`
331320
- **Type**: Counter
321+
- **Labels**:
322+
- `response_type`: see [description](#metric-name-kms_connector_tx_sender_response_received_counter)
332323
- **Description**: Counts the number of errors encountered by the TX sender while sending transactions to the Gateway.
333324
- **Alarm**: If the counter increases over a period of time.
334-
- **Recommendation**: more than 60 failures in 1 minute, i.e. `increase(counter[1m]) > 60`.
325+
- **Recommendation**: more than 60 failures in 1 minute, i.e. `sum(increase(counter[1m])) > 60`.
326+
327+
#### Metric Name: `kms_connector_pending_events`
328+
- **Type**: Gauge
329+
- **Labels**:
330+
- `event_type`: see [description](#metric-name-kms_connector_gw_listener_event_received_counter) (only available for decryption right now!)
331+
- **Description**: Tracks the number of Gateway events not yet processed in the kms-connector's DB.
332+
- **Alarm**: Need more experience with this metric first.
333+
334+
#### Metric Name: `kms_connector_pending_responses`
335+
- **Type**: Gauge
336+
- **Labels**:
337+
- `response_type`: see [description](#metric-name-kms_connector_tx_sender_response_received_counter) (only available for decryption right now!)
338+
- **Description**: Tracks the number of KMS responses not yet sent to the Gateway in the kms-connector's DB.
339+
- **Alarm**: Need more experience with this metric first.

kms-connector/.sqlx/query-05f22646520c5835fc3235aa6378dccd4436ca4a07ab25a9c8abe21760d5b202.json

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kms-connector/.sqlx/query-17db0e005e1367157721bf62877d735a9fb755f5d9afe72f069a1872ff1b7fd6.json

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kms-connector/.sqlx/query-a70444056f27bd68660c86bade10eccfac9cb5fd677de32dcc7bea043d3ca237.json

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kms-connector/.sqlx/query-f4cec78c0611edc0cf747320db7f0fb1c9eaec89303c3f3e45617b155fa9aed8.json

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kms-connector/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

kms-connector/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ gw-listener.path = "crates/gw-listener"
1818
kms-worker.path = "crates/kms-worker"
1919
tx-sender.path = "crates/tx-sender"
2020
connector-utils.path = "crates/utils"
21-
fhevm_gateway_bindings = { git = "https://github.com/zama-ai/fhevm.git", tag = "v0.10.0-2", default-features = false }
21+
fhevm_gateway_bindings = { git = "https://github.com/zama-ai/fhevm.git", tag = "v0.10.0", default-features = false }
2222
kms-grpc = { git = "https://github.com/zama-ai/kms.git", tag = "v0.12.4", default-features = true }
2323
bc2wrap = { git = "https://github.com/zama-ai/kms.git", tag = "v0.12.4", default-features = true }
2424
tfhe = "=1.4.0-alpha.3"

kms-connector/config/tx-sender.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ private_key = "8da4ef21b864d2cc526dbdb2a120bd2874c36c9d0a1fb7f8c63d7f7a8b41de8f"
7070
# ENV: KMS_CONNECTOR_MONITORING_ENDPOINT
7171
# monitoring_endpoint = "0.0.0.0:9100"
7272

73+
# The interval between updates of gauge metrics (optional, defaults to 10s)
74+
# ENV: KMS_CONNECTOR_GAUGE_UPDATE_INTERVAL_SECS
75+
# gauge_update_interval_secs = 10
76+
7377
# The timeout to perform each external service connection healthcheck (optional, defaults to 3s)
7478
# ENV: KMS_CONNECTOR_HEALTHCHECK_TIMEOUT_SECS
7579
# healthcheck_timeout_secs = 3

kms-connector/crates/gw-listener/src/core/gw_listener.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use crate::{
33
core::{publish::update_last_block_polled, publish_event},
44
monitoring::{
55
health::State,
6-
metrics::{EVENT_RECEIVED_COUNTER, EVENT_RECEIVED_ERRORS, EVENT_STORAGE_ERRORS},
6+
metrics::{EVENT_RECEIVED_COUNTER, EVENT_RECEIVED_ERRORS},
77
},
88
};
99
use alloy::{
@@ -198,14 +198,19 @@ where
198198
match events.next().await {
199199
Some(Ok((event, log))) => {
200200
*last_block = log.block_number;
201-
EVENT_RECEIVED_COUNTER.inc();
201+
EVENT_RECEIVED_COUNTER
202+
.with_label_values(&[event_type.as_str()])
203+
.inc();
204+
202205
let db = self.db_pool.clone();
203206
spawn_with_limit(handle_gateway_event(db, event.into(), log.block_number))
204207
.await;
205208
}
206209
Some(Err(err)) => {
207210
error!("Error while listening for {event_type} events: {err}");
208-
EVENT_RECEIVED_ERRORS.inc();
211+
EVENT_RECEIVED_ERRORS
212+
.with_label_values(&[event_type.as_str()])
213+
.inc();
209214
continue;
210215
}
211216
None => break error!("Alloy Provider was dropped for {event_type}"),
@@ -274,7 +279,6 @@ async fn handle_gateway_event(
274279
);
275280
if let Err(err) = publish_event(&db_pool, event, block_number).await {
276281
error!("Failed to publish event: {err}");
277-
EVENT_STORAGE_ERRORS.inc();
278282
}
279283
}
280284

kms-connector/crates/gw-listener/src/core/publish.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use crate::monitoring::metrics::EVENT_STORED_COUNTER;
21
use alloy::primitives::U256;
32
use anyhow::anyhow;
43
use connector_utils::{
@@ -44,7 +43,6 @@ pub async fn publish_event(
4443

4544
if query_result.rows_affected() == 1 {
4645
info!("Event successfully stored in DB!");
47-
EVENT_STORED_COUNTER.inc();
4846
} else {
4947
warn!("Unexpected query result while publishing event: {query_result:?}");
5048
}

0 commit comments

Comments
 (0)