Skip to content

Commit 070af99

Browse files
authored
feat: system and resource metrics (#37)
1 parent aa7e4eb commit 070af99

File tree

3 files changed

+228
-3
lines changed

3 files changed

+228
-3
lines changed

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ require (
7070
github.com/mitchellh/mapstructure v1.4.2 // indirect
7171
github.com/pierrec/xxHash v0.1.5 // indirect
7272
github.com/pmezard/go-difflib v1.0.0 // indirect
73-
github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible // indirect
73+
github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible
7474
github.com/tklauser/go-sysconf v0.3.12 // indirect
7575
github.com/tklauser/numcpus v0.6.1 // indirect
7676
go.opentelemetry.io/otel/sdk v1.16.0

observability/metrics.go

+12-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"sync"
88
"time"
99

10+
"github.com/sygmaprotocol/sygma-core/observability/metrics"
1011
"go.opentelemetry.io/otel/attribute"
1112
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
1213
"go.opentelemetry.io/otel/metric"
@@ -58,6 +59,8 @@ func InitMetricProvider(ctx context.Context, agentURL string) (*sdkmetric.MeterP
5859
}
5960

6061
type RelayerMetrics struct {
62+
*metrics.SystemMetrics
63+
6164
meter metric.Meter
6265
Opts api.MeasurementOption
6366

@@ -73,11 +76,11 @@ type RelayerMetrics struct {
7376
}
7477

7578
// NewRelayerMetrics initializes OpenTelemetry metrics
76-
func NewRelayerMetrics(meter metric.Meter, attributes ...attribute.KeyValue) (*RelayerMetrics, error) {
79+
func NewRelayerMetrics(ctx context.Context, meter metric.Meter, attributes ...attribute.KeyValue) (*RelayerMetrics, error) {
7780
opts := api.WithAttributes(attributes...)
7881

7982
blockDeltaMap := make(map[uint8]*big.Int)
80-
blockDeltaGauge, err := meter.Int64ObservableGauge(
83+
blockDeltaGauge, _ := meter.Int64ObservableGauge(
8184
"relayer.BlockDelta",
8285
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
8386
for domainID, delta := range blockDeltaMap {
@@ -90,7 +93,14 @@ func NewRelayerMetrics(meter metric.Meter, attributes ...attribute.KeyValue) (*R
9093
}),
9194
metric.WithDescription("Difference between chain head and current indexed block per domain"),
9295
)
96+
97+
systemMetrics, err := metrics.NewSystemMetrics(ctx, meter, opts)
98+
if err != nil {
99+
return nil, err
100+
}
101+
93102
return &RelayerMetrics{
103+
SystemMetrics: systemMetrics,
94104
meter: meter,
95105
MessageEventTime: make(map[string]time.Time),
96106
Opts: opts,

observability/metrics/system.go

+215
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
package metrics
2+
3+
import (
4+
"context"
5+
"runtime"
6+
"runtime/debug"
7+
"time"
8+
9+
"github.com/shirou/gopsutil/cpu"
10+
"github.com/shirou/gopsutil/disk"
11+
"github.com/shirou/gopsutil/mem"
12+
"github.com/shirou/gopsutil/net"
13+
"go.opentelemetry.io/otel/metric"
14+
)
15+
16+
// GC_STATS_UPDATE_PERIOD is the interval at which garbage-collector statistics
// are polled and fed into the GC duration histogram.
const GC_STATS_UPDATE_PERIOD = 10 * time.Second
19+
20+
type SystemMetrics struct {
21+
opts metric.MeasurementOption
22+
23+
goRoutinesGauge metric.Int64ObservableGauge
24+
totalMemoryGauge metric.Int64ObservableGauge
25+
usedMemoryGauge metric.Int64ObservableGauge
26+
cpuUsageGauge metric.Float64ObservableGauge
27+
gcDurationHistogram metric.Float64Histogram
28+
diskUsageGauge metric.Int64ObservableGauge
29+
totalDiskGauge metric.Int64ObservableGauge
30+
networkIOReceivedGauge metric.Int64ObservableGauge
31+
networkIOSentGauge metric.Int64ObservableGauge
32+
}
33+
34+
// NewSystemMetrics initializes system performance and resource utilization metrics
35+
func NewSystemMetrics(ctx context.Context, meter metric.Meter, opts metric.MeasurementOption) (*SystemMetrics, error) {
36+
goRoutinesGauge, err := meter.Int64ObservableGauge(
37+
"relayer.GoRoutines",
38+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
39+
result.Observe(int64(runtime.NumGoroutine()), opts)
40+
return nil
41+
}),
42+
metric.WithDescription("Number of Go routines running."),
43+
)
44+
if err != nil {
45+
return nil, err
46+
}
47+
48+
usedMemoryGauge, err := meter.Int64ObservableGauge(
49+
"relayer.MemoryUsageBytes",
50+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
51+
v, err := mem.VirtualMemory()
52+
if err != nil {
53+
return err
54+
}
55+
56+
result.Observe(int64(v.Used), opts)
57+
return nil
58+
}),
59+
metric.WithDescription("Memory usage in bytes."),
60+
)
61+
if err != nil {
62+
return nil, err
63+
}
64+
totalMemoryGauge, err := meter.Int64ObservableGauge(
65+
"relayer.TotalMemoryBytes",
66+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
67+
v, err := mem.VirtualMemory()
68+
if err != nil {
69+
return err
70+
}
71+
72+
result.Observe(int64(v.Total), opts)
73+
return nil
74+
}),
75+
metric.WithDescription("Total memory in bytes."),
76+
)
77+
if err != nil {
78+
return nil, err
79+
}
80+
81+
cpuUsageGauge, err := meter.Float64ObservableGauge(
82+
"relayer.CpuUsagePercent",
83+
metric.WithFloat64Callback(func(context context.Context, result metric.Float64Observer) error {
84+
percents, err := cpu.Percent(0, false)
85+
if err != nil {
86+
return err
87+
}
88+
89+
result.Observe(percents[0], opts)
90+
return nil
91+
}),
92+
metric.WithDescription("CPU usage percent."),
93+
)
94+
if err != nil {
95+
return nil, err
96+
}
97+
98+
diskUsageGauge, err := meter.Int64ObservableGauge(
99+
"relayer.DiskUsageBytes",
100+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
101+
usage, err := disk.Usage("/")
102+
if err != nil {
103+
return err
104+
}
105+
106+
result.Observe(int64(usage.Used), opts)
107+
return nil
108+
}),
109+
metric.WithDescription("Disk space used by the relayer in bytes."),
110+
)
111+
if err != nil {
112+
return nil, err
113+
}
114+
totalDiskGauge, err := meter.Int64ObservableGauge(
115+
"relayer.TotalDiskBytes",
116+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
117+
usage, err := disk.Usage("/")
118+
if err != nil {
119+
return err
120+
}
121+
122+
result.Observe(int64(usage.Total), opts)
123+
return nil
124+
}),
125+
metric.WithDescription("Total relayer disk space."),
126+
)
127+
if err != nil {
128+
return nil, err
129+
}
130+
131+
networkIOReceivedGauge, err := meter.Int64ObservableGauge(
132+
"relayer.NetworkIOBytesReceived",
133+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
134+
stat, err := net.IOCounters(false)
135+
if err != nil {
136+
return err
137+
}
138+
139+
result.Observe(int64(stat[0].BytesRecv), opts)
140+
return nil
141+
}),
142+
metric.WithDescription("Total network bytes received."),
143+
)
144+
if err != nil {
145+
return nil, err
146+
}
147+
networkIOSentGauge, err := meter.Int64ObservableGauge(
148+
"relayer.NetworkIOBytesSent",
149+
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
150+
stat, err := net.IOCounters(false)
151+
if err != nil {
152+
return err
153+
}
154+
155+
result.Observe(int64(stat[0].BytesSent), opts)
156+
return nil
157+
}),
158+
metric.WithDescription("Total network bytes sent."),
159+
)
160+
if err != nil {
161+
return nil, err
162+
}
163+
164+
gcDurationHistogram, err := meter.Float64Histogram(
165+
"relayer.GcDurationSeconds",
166+
metric.WithDescription("Duration of garbage collection cycles."),
167+
)
168+
if err != nil {
169+
return nil, err
170+
}
171+
172+
m := &SystemMetrics{
173+
opts: opts,
174+
goRoutinesGauge: goRoutinesGauge,
175+
totalMemoryGauge: totalMemoryGauge,
176+
usedMemoryGauge: usedMemoryGauge,
177+
gcDurationHistogram: gcDurationHistogram,
178+
cpuUsageGauge: cpuUsageGauge,
179+
totalDiskGauge: totalDiskGauge,
180+
diskUsageGauge: diskUsageGauge,
181+
networkIOReceivedGauge: networkIOReceivedGauge,
182+
networkIOSentGauge: networkIOSentGauge,
183+
}
184+
185+
go m.updateGCStats(ctx)
186+
return m, err
187+
}
188+
189+
func (m *SystemMetrics) updateGCStats(ctx context.Context) {
190+
ticker := time.NewTicker(GC_STATS_UPDATE_PERIOD)
191+
var previousPauseDuration float64
192+
for {
193+
select {
194+
case <-ticker.C:
195+
{
196+
var gcStats debug.GCStats
197+
debug.ReadGCStats(&gcStats)
198+
if len(gcStats.Pause) == 0 {
199+
continue
200+
}
201+
202+
recentPauseDuration := gcStats.Pause[0].Seconds()
203+
if recentPauseDuration == previousPauseDuration {
204+
continue
205+
}
206+
207+
m.gcDurationHistogram.Record(context.Background(), recentPauseDuration, m.opts)
208+
previousPauseDuration = recentPauseDuration
209+
}
210+
case <-ctx.Done():
211+
return
212+
213+
}
214+
}
215+
}

0 commit comments

Comments
 (0)