Skip to content

Commit 6c3c08f

Browse files
tanut32039Tanut Lertwarachai
andauthored
Improve telemetry (#38)
* fix telemetry * fix comment * add doc * fix comment * add function name coment * fix md * fix metric logic * change to sw case * fix comment --------- Co-authored-by: Tanut Lertwarachai <tanutlertwarachai@Tanuts-MacBook-Pro.local>
1 parent c2eb10a commit 6c3c08f

13 files changed

Lines changed: 194 additions & 106 deletions

File tree

docs/telemetry/README.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Falcon Telemetry
2+
3+
## Metrics Tracked
4+
5+
The following telemetry metrics are captured and exposed via Prometheus:
6+
7+
### Packet Relaying Metrics
8+
9+
- `falcon_packets_relayed_success` (Counter): Total number of packets successfully relayed from BandChain
10+
11+
- `falcon_unrelayed_packets` (Gauge): Number of unrelayed packets (the difference between total packets from BandChain and received packets from the target chain)
12+
13+
### Task Execution Metrics
14+
15+
- `falcon_tasks_count` (Counter): Total number of executed tasks
16+
- `falcon_finished_task_execution_time` (Summary): Execution time (ms) for finished tasks
17+
18+
### Tunnel and Contract Metrics
19+
- `falcon_tunnels_per_destination_chain` (Counter): Total number of tunnels per destination chain
20+
21+
- `falcon_active_target_contracts_count` (Gauge): Number of active target chain contracts
22+
23+
### Transaction Metrics
24+
- `falcon_txs_count` (Counter): Total number of transactions
25+
26+
- `falcon_tx_process_time` (Summary): Processing time (ms) per transaction
27+
28+
- `falcon_gas_used` (Summary): Amount of gas used per transaction
29+
30+
## Grafana Dashboard
31+
A pre-built Falcon dashboard for visualizing relay metrics is published in Grafana's dashboard library. You can download it from the link below and import it into your own Grafana instance.
32+
33+
34+
- Dashboard Link: [Falcon Grafana Dashboard](https://grafana.com/grafana/dashboards/23038-falcon/)
35+
36+
### Example Screenshots
37+
![Packets Relayed Panel](example-packets-relayed.png)
38+
39+
![Destination Chain Panel](example-destination-chain.png)
296 KB
Loading
237 KB
Loading

internal/relayermetrics/metrics.go

Lines changed: 61 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,20 @@ var metrics *PrometheusMetrics
2020
// It is set on initialization and does not change for the lifetime of the program.
2121
var globalTelemetryEnabled bool
2222

23+
// Task statuses used as labels
24+
const (
25+
FinishedTaskStatus = "finished"
26+
ErrorTaskStatus = "error"
27+
SkippedTaskStatus = "skipped"
28+
)
29+
2330
type PrometheusMetrics struct {
2431
PacketsRelayedSuccess *prometheus.CounterVec
2532
UnrelayedPackets *prometheus.GaugeVec
2633
TasksCount *prometheus.CounterVec
27-
TaskExecutionTime *prometheus.SummaryVec
34+
FinishedTaskExecutionTime *prometheus.SummaryVec
2835
TunnelsPerDestinationChain *prometheus.CounterVec
29-
ActiveTargetContractsCount prometheus.Gauge
36+
ActiveTargetContractsCount *prometheus.GaugeVec
3037
TxsCount *prometheus.CounterVec
3138
TxProcessTime *prometheus.SummaryVec
3239
GasUsed *prometheus.SummaryVec
@@ -38,82 +45,93 @@ func updateMetrics(updateFn func()) {
3845
}
3946
}
4047

41-
// IncPacketsRelayedSuccess increments the count of successfully relayed packets for a specific tunnel.
48+
// IncPacketsRelayedSuccess increments the count of successfully relayed packets.
4249
func IncPacketsRelayedSuccess(tunnelID uint64) {
4350
updateMetrics(func() {
4451
metrics.PacketsRelayedSuccess.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Inc()
4552
})
4653
}
4754

48-
// SetUnrelayedPackets sets the number of unrelayed packets for a specific tunnel.
49-
func SetUnrelayedPackets(tunnelID uint64, unrelayedPackets float64) {
55+
// SetUnrelayedPackets sets the number of unrelayed packets.
56+
func SetUnrelayedPackets(tunnelID uint64, unrelayedPackets uint64) {
5057
updateMetrics(func() {
51-
metrics.UnrelayedPackets.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Set(unrelayedPackets)
58+
metrics.UnrelayedPackets.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Set(float64(unrelayedPackets))
5259
})
5360
}
5461

55-
// IncTasksCount increments the total tasks count for a specific tunnel.
56-
func IncTasksCount(tunnelID uint64) {
62+
// IncTasksCount increments the total count of executed tasks.
63+
func IncTasksCount(tunnelID uint64, destinationChain string, taskStatus string) {
5764
updateMetrics(func() {
58-
metrics.TasksCount.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Inc()
65+
metrics.TasksCount.WithLabelValues(fmt.Sprintf("%d", tunnelID), destinationChain, taskStatus).Inc()
5966
})
6067
}
6168

62-
// ObserveTaskExecutionTime records the execution time of a task for a specific tunnel.
63-
func ObserveTaskExecutionTime(tunnelID uint64, taskExecutionTime float64) {
69+
// ObserveFinishedTaskExecutionTime records the execution time (ms) of a finished task.
70+
func ObserveFinishedTaskExecutionTime(
71+
tunnelID uint64,
72+
destinationChain string,
73+
finishedTaskExecutionTime int64,
74+
) {
6475
updateMetrics(func() {
65-
metrics.TaskExecutionTime.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Observe(taskExecutionTime)
76+
metrics.FinishedTaskExecutionTime.WithLabelValues(fmt.Sprintf("%d", tunnelID), destinationChain).
77+
Observe(float64(finishedTaskExecutionTime))
6678
})
6779
}
6880

69-
// IncTunnelsPerDestinationChain increments the total number of tunnels per specific destination chain.
81+
// IncTunnelsPerDestinationChain increments the count of tunnels per destination chain.
7082
func IncTunnelsPerDestinationChain(destinationChain string) {
7183
updateMetrics(func() {
7284
metrics.TunnelsPerDestinationChain.WithLabelValues(destinationChain).Inc()
7385
})
7486
}
7587

7688
// IncActiveTargetContractsCount increases the count of active target contracts.
77-
func IncActiveTargetContractsCount() {
89+
func IncActiveTargetContractsCount(destinationChain string) {
7890
updateMetrics(func() {
79-
metrics.ActiveTargetContractsCount.Inc()
91+
metrics.ActiveTargetContractsCount.WithLabelValues(destinationChain).Inc()
8092
})
8193
}
8294

8395
// DecActiveTargetContractsCount decreases the count of active target contracts.
84-
func DecActiveTargetContractsCount() {
96+
func DecActiveTargetContractsCount(destinationChain string) {
8597
updateMetrics(func() {
86-
metrics.ActiveTargetContractsCount.Dec()
98+
metrics.ActiveTargetContractsCount.WithLabelValues(destinationChain).Dec()
8799
})
88100
}
89101

90-
// IncTxsCount increments the transactions count metric for a specific tunnel.
91-
func IncTxsCount(tunnelID uint64) {
102+
// IncTxsCount increments the transactions count.
103+
func IncTxsCount(tunnelID uint64, destinationChain string, txStatus string) {
92104
updateMetrics(func() {
93-
metrics.TxsCount.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Inc()
105+
metrics.TxsCount.WithLabelValues(fmt.Sprintf("%d", tunnelID), destinationChain, txStatus).Inc()
94106
})
95107
}
96108

97-
// ObserveTxProcessTime tracks transaction processing time in seconds with millisecond precision.
98-
func ObserveTxProcessTime(destinationChain string, txProcessTime float64) {
109+
// ObserveTxProcessTime records the processing time (ms) for each transaction.
110+
func ObserveTxProcessTime(tunnelID uint64, destinationChain string, txStatus string, txProcessTime int64) {
99111
updateMetrics(func() {
100-
metrics.TxProcessTime.WithLabelValues(destinationChain).Observe(txProcessTime)
112+
metrics.TxProcessTime.WithLabelValues(fmt.Sprintf("%d", tunnelID), destinationChain, txStatus).
113+
Observe(float64(txProcessTime))
101114
})
102115
}
103116

104-
// ObserveGasUsed tracks gas used for the each relayed transaction.
105-
func ObserveGasUsed(tunnelID uint64, gasUsed uint64) {
117+
// ObserveGasUsed tracks the amount of gas used for each transaction.
118+
func ObserveGasUsed(tunnelID uint64, destinationChain string, txStatus string, gasUsed float64) {
106119
updateMetrics(func() {
107-
metrics.GasUsed.WithLabelValues(fmt.Sprintf("%d", tunnelID)).Observe(float64(gasUsed))
120+
metrics.GasUsed.WithLabelValues(fmt.Sprintf("%d", tunnelID), destinationChain, txStatus).
121+
Observe(gasUsed)
108122
})
109123
}
110124

111125
func InitPrometheusMetrics() {
112126
packetLabels := []string{"tunnel_id"}
113-
taskLabels := []string{"tunnel_id"}
127+
tasksCountLabels := []string{"tunnel_id", "destination_chain", "task_status"}
128+
finishedTaskExecutionTimeLabels := []string{"tunnel_id", "destination_chain"}
114129
tunnelPerDestinationChainLabels := []string{"destination_chain"}
115-
txLabels := []string{"tunnel_id"}
116-
gasUsedLabels := []string{"tunnel_id"}
130+
activeTargetContractsLabels := []string{"destination_chain"}
131+
txsCountLabels := []string{"tunnel_id", "destination_chain", "tx_status"}
132+
txProcessTimeLabels := []string{"tunnel_id", "destination_chain", "tx_status"}
133+
134+
gasUsedLabels := []string{"tunnel_id", "destination_chain", "tx_status"}
117135

118136
metrics = &PrometheusMetrics{
119137
PacketsRelayedSuccess: promauto.NewCounterVec(prometheus.CounterOpts{
@@ -126,38 +144,38 @@ func InitPrometheusMetrics() {
126144
}, packetLabels),
127145
TasksCount: promauto.NewCounterVec(prometheus.CounterOpts{
128146
Name: "falcon_tasks_count",
129-
Help: "Total number of successfully executed tasks",
130-
}, taskLabels),
131-
TaskExecutionTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
132-
Name: "falcon_task_execution_time",
133-
Help: "Task execution time in milliseconds",
147+
Help: "Total number of executed tasks",
148+
}, tasksCountLabels),
149+
FinishedTaskExecutionTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
150+
Name: "falcon_finished_task_execution_time",
151+
Help: "Execution time (ms) for finished tasks",
134152
Objectives: map[float64]float64{
135153
0.5: 0.05,
136154
0.9: 0.01,
137155
0.99: 0.001,
138156
},
139-
}, taskLabels),
157+
}, finishedTaskExecutionTimeLabels),
140158
TunnelsPerDestinationChain: promauto.NewCounterVec(prometheus.CounterOpts{
141159
Name: "falcon_tunnels_per_destination_chain",
142-
Help: "Total number of destination chains",
160+
Help: "Total number of tunnels per destination chain",
143161
}, tunnelPerDestinationChainLabels),
144-
ActiveTargetContractsCount: promauto.NewGauge(prometheus.GaugeOpts{
162+
ActiveTargetContractsCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
145163
Name: "falcon_active_target_contracts_count",
146164
Help: "Number of active target chain contracts",
147-
}),
165+
}, activeTargetContractsLabels),
148166
TxsCount: promauto.NewCounterVec(prometheus.CounterOpts{
149167
Name: "falcon_txs_count",
150-
Help: "Total number of transactions per tunnel",
151-
}, txLabels),
168+
Help: "Total number of transactions",
169+
}, txsCountLabels),
152170
TxProcessTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
153171
Name: "falcon_tx_process_time",
154-
Help: "Transaction processing time in milliseconds",
172+
Help: "Processing time (ms) for transaction",
155173
Objectives: map[float64]float64{
156174
0.5: 0.05,
157175
0.9: 0.01,
158176
0.99: 0.001,
159177
},
160-
}, txLabels),
178+
}, txProcessTimeLabels),
161179
GasUsed: promauto.NewSummaryVec(prometheus.SummaryOpts{
162180
Name: "falcon_gas_used",
163181
Help: "Amount of gas used per transaction",
@@ -178,7 +196,7 @@ func StartMetricsServer(ctx context.Context, log *zap.Logger, metricsListenAddr
178196
ln, err := net.Listen("tcp", metricsListenAddr)
179197
if err != nil {
180198
log.Error(
181-
"Failed to start metrics server you can change the address and port using metrics-listen-addr config setting or --metrics-listen-flag",
199+
"Failed to start metrics server you can change the address and port using metrics-listen-addr config setting or --metrics-listen-addr flag",
182200
)
183201

184202
return fmt.Errorf("failed to listen on metrics address %q: %w", metricsListenAddr, err)

internal/relayertest/mocks/chain_provider.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

relayer/app.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,6 @@ func (a *App) Relay(ctx context.Context, tunnelID uint64) error {
386386
tr := NewTunnelRelayer(
387387
a.Log,
388388
tunnel.ID,
389-
tunnel.TargetAddress,
390389
a.Config.Global.CheckingPacketInterval,
391390
a.BandClient,
392391
chainProvider,

relayer/chains/evm/provider.go

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,6 @@ func (cp *EVMChainProvider) RelayPacket(ctx context.Context, packet *bandtypes.P
182182

183183
createdAt := time.Now()
184184

185-
// increment the transactions count metric for the current tunnel
186-
relayermetrics.IncTxsCount(packet.TunnelID)
187-
188185
log.Info(
189186
"Submitted a message; checking transaction status",
190187
zap.String("tx_hash", txHash),
@@ -193,6 +190,7 @@ func (cp *EVMChainProvider) RelayPacket(ctx context.Context, packet *bandtypes.P
193190

194191
var checkTxErr error
195192
var txStatus TxStatus
193+
var gasUsed decimal.NullDecimal
196194
checkTxLogic:
197195
for time.Since(createdAt) < cp.Config.WaitingTxDuration {
198196
result, err := cp.CheckConfirmedTx(ctx, txHash)
@@ -212,13 +210,18 @@ func (cp *EVMChainProvider) RelayPacket(ctx context.Context, packet *bandtypes.P
212210

213211
checkTxErr = nil
214212
txStatus = result.Status
213+
gasUsed = result.GasUsed
214+
215215
switch result.Status {
216216
case TX_STATUS_SUCCESS:
217-
// track transaction processing time in seconds with millisecond precision
218-
relayermetrics.ObserveTxProcessTime(cp.ChainName, float64(time.Since(createdAt).Milliseconds()))
217+
// increment the transactions count metric
218+
relayermetrics.IncTxsCount(packet.TunnelID, cp.ChainName, TX_STATUS_SUCCESS.String())
219+
220+
// track transaction processing time (ms)
221+
relayermetrics.ObserveTxProcessTime(packet.TunnelID, cp.ChainName, TX_STATUS_SUCCESS.String(), time.Since(createdAt).Milliseconds())
219222

220223
// track gas used for the relayed transaction
221-
relayermetrics.ObserveGasUsed(packet.TunnelID, result.GasUsed.Decimal.BigInt().Uint64())
224+
relayermetrics.ObserveGasUsed(packet.TunnelID, cp.ChainName, TX_STATUS_SUCCESS.String(), gasUsed.Decimal.InexactFloat64())
222225

223226
log.Info(
224227
"Packet is successfully relayed",
@@ -227,6 +230,16 @@ func (cp *EVMChainProvider) RelayPacket(ctx context.Context, packet *bandtypes.P
227230
)
228231
return nil
229232
case TX_STATUS_FAILED:
233+
// track transaction processing time (ms)
234+
relayermetrics.ObserveTxProcessTime(
235+
packet.TunnelID,
236+
cp.ChainName,
237+
TX_STATUS_FAILED.String(),
238+
time.Since(createdAt).Milliseconds(),
239+
)
240+
241+
// track gas used for the relayed transaction
242+
relayermetrics.ObserveGasUsed(packet.TunnelID, cp.ChainName, TX_STATUS_FAILED.String(), gasUsed.Decimal.InexactFloat64())
230243
log.Debug(
231244
"Transaction failed during relay attempt",
232245
zap.Error(err),
@@ -246,6 +259,9 @@ func (cp *EVMChainProvider) RelayPacket(ctx context.Context, packet *bandtypes.P
246259
}
247260
}
248261

262+
// increment the transactions count metric
263+
relayermetrics.IncTxsCount(packet.TunnelID, cp.ChainName, txStatus.String())
264+
249265
log.Error(
250266
"Failed to relaying a packet with status and error",
251267
zap.Error(checkTxErr),
@@ -296,19 +312,16 @@ func (cp *EVMChainProvider) CheckConfirmedTx(
296312
ctx context.Context,
297313
txHash string,
298314
) (*ConfirmTxResult, error) {
299-
failResult := NewConfirmTxResult(
300-
txHash,
301-
TX_STATUS_UNMINED,
302-
decimal.NullDecimal{},
303-
)
304-
305315
receipt, err := cp.Client.GetTxReceipt(ctx, txHash)
306316
if err != nil {
307317
return nil, fmt.Errorf("failed to get tx receipt: %w", err)
308318
}
309319

320+
// read gas used from the receipt so it can be reported for both failed and successful txs
321+
gasUsed := decimal.NewNullDecimal(decimal.New(int64(receipt.GasUsed), 0))
322+
310323
if receipt.Status == gethtypes.ReceiptStatusFailed {
311-
return failResult.WithStatus(TX_STATUS_FAILED), nil
324+
return NewConfirmTxResult(txHash, TX_STATUS_FAILED, gasUsed), nil
312325
}
313326

314327
latestBlock, err := cp.Client.GetBlockHeight(ctx)
@@ -318,11 +331,9 @@ func (cp *EVMChainProvider) CheckConfirmedTx(
318331

319332
// if the tx block has not yet reached the required number of block confirmations, report the tx as unmined
320333
if receipt.BlockNumber.Uint64() > latestBlock-cp.Config.BlockConfirmation {
321-
return failResult.WithStatus(TX_STATUS_UNMINED), nil
334+
return NewConfirmTxResult(txHash, TX_STATUS_UNMINED, decimal.NullDecimal{}), nil
322335
}
323336

324-
// calculate gas used and effective gas price
325-
gasUsed := decimal.NewNullDecimal(decimal.New(int64(receipt.GasUsed), 0))
326337
return NewConfirmTxResult(txHash, TX_STATUS_SUCCESS, gasUsed), nil
327338
}
328339

@@ -574,6 +585,11 @@ func (cp *EVMChainProvider) QueryBalance(
574585
return cp.Client.GetBalance(ctx, address)
575586
}
576587

588+
// GetChainName retrieves the chain name from the chain provider.
589+
func (cp *EVMChainProvider) GetChainName() string {
590+
return cp.ChainName
591+
}
592+
577593
// queryRelayerGasFee queries the relayer gas fee being set on tunnel router.
578594
func (cp *EVMChainProvider) queryRelayerGasFee(ctx context.Context) (*big.Int, error) {
579595
calldata, err := cp.TunnelRouterABI.Pack("gasFee")

relayer/chains/evm/provider_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ func (s *ProviderTestSuite) TestCheckConfirmedTx() {
259259
out: evm.NewConfirmTxResult(
260260
txHash,
261261
evm.TX_STATUS_FAILED,
262-
decimal.NullDecimal{},
262+
decimal.NewNullDecimal(decimal.New(21000, 0)),
263263
),
264264
},
265265
{

0 commit comments

Comments
 (0)