Skip to content

feat: system and resource metrics #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ require (
github.com/mitchellh/mapstructure v1.4.2 // indirect
github.com/pierrec/xxHash v0.1.5 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible // indirect
github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible
github.com/tklauser/go-sysconf v0.3.12 // indirect
github.com/tklauser/numcpus v0.6.1 // indirect
go.opentelemetry.io/otel/sdk v1.16.0
Expand Down
14 changes: 12 additions & 2 deletions observability/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"sync"
"time"

"github.com/sygmaprotocol/sygma-core/observability/metrics"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
"go.opentelemetry.io/otel/metric"
Expand Down Expand Up @@ -58,6 +59,8 @@ func InitMetricProvider(ctx context.Context, agentURL string) (*sdkmetric.MeterP
}

type RelayerMetrics struct {
*metrics.SystemMetrics

meter metric.Meter
Opts api.MeasurementOption

Expand All @@ -73,11 +76,11 @@ type RelayerMetrics struct {
}

// NewRelayerMetrics initializes OpenTelemetry metrics
func NewRelayerMetrics(meter metric.Meter, attributes ...attribute.KeyValue) (*RelayerMetrics, error) {
func NewRelayerMetrics(ctx context.Context, meter metric.Meter, attributes ...attribute.KeyValue) (*RelayerMetrics, error) {
opts := api.WithAttributes(attributes...)

blockDeltaMap := make(map[uint8]*big.Int)
blockDeltaGauge, err := meter.Int64ObservableGauge(
blockDeltaGauge, _ := meter.Int64ObservableGauge(
"relayer.BlockDelta",
metric.WithInt64Callback(func(context context.Context, result metric.Int64Observer) error {
for domainID, delta := range blockDeltaMap {
Expand All @@ -90,7 +93,14 @@ func NewRelayerMetrics(meter metric.Meter, attributes ...attribute.KeyValue) (*R
}),
metric.WithDescription("Difference between chain head and current indexed block per domain"),
)

systemMetrics, err := metrics.NewSystemMetrics(ctx, meter, opts)
if err != nil {
return nil, err
}

return &RelayerMetrics{
SystemMetrics: systemMetrics,
meter: meter,
MessageEventTime: make(map[string]time.Time),
Opts: opts,
Expand Down
215 changes: 215 additions & 0 deletions observability/metrics/system.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
package metrics

import (
"context"
"runtime"
"runtime/debug"
"time"

"github.com/shirou/gopsutil/cpu"
"github.com/shirou/gopsutil/disk"
"github.com/shirou/gopsutil/mem"
"github.com/shirou/gopsutil/net"
"go.opentelemetry.io/otel/metric"
)

// GC_STATS_UPDATE_PERIOD is the interval at which updateGCStats polls the
// runtime's garbage collector statistics.
const GC_STATS_UPDATE_PERIOD = 10 * time.Second

// SystemMetrics bundles the OpenTelemetry instruments created by
// NewSystemMetrics together with the measurement options applied to every
// value they report.
type SystemMetrics struct {
	// opts is attached to each observation and recording.
	opts metric.MeasurementOption

	// Go runtime instruments.
	goRoutinesGauge     metric.Int64ObservableGauge
	gcDurationHistogram metric.Float64Histogram

	// Memory and CPU instruments.
	usedMemoryGauge  metric.Int64ObservableGauge
	totalMemoryGauge metric.Int64ObservableGauge
	cpuUsageGauge    metric.Float64ObservableGauge

	// Disk instruments.
	diskUsageGauge metric.Int64ObservableGauge
	totalDiskGauge metric.Int64ObservableGauge

	// Network instruments.
	networkIOReceivedGauge metric.Int64ObservableGauge
	networkIOSentGauge     metric.Int64ObservableGauge
}

// NewSystemMetrics initializes system performance and resource utilization metrics.
//
// It registers observable gauges on meter for goroutine count, memory, CPU,
// disk ("/") and network I/O, plus a histogram for GC pause durations. Every
// reported value carries opts. A background goroutine is started that feeds
// the GC histogram until ctx is cancelled.
//
// It returns the first error encountered while registering an instrument; in
// that case no goroutine is started and the returned *SystemMetrics is nil.
func NewSystemMetrics(ctx context.Context, meter metric.Meter, opts metric.MeasurementOption) (*SystemMetrics, error) {
	goRoutinesGauge, err := meter.Int64ObservableGauge(
		"relayer.GoRoutines",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			result.Observe(int64(runtime.NumGoroutine()), opts)
			return nil
		}),
		metric.WithDescription("Number of Go routines running."),
	)
	if err != nil {
		return nil, err
	}

	usedMemoryGauge, err := meter.Int64ObservableGauge(
		"relayer.MemoryUsageBytes",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			v, err := mem.VirtualMemory()
			if err != nil {
				return err
			}

			result.Observe(int64(v.Used), opts)
			return nil
		}),
		metric.WithDescription("Memory usage in bytes."),
	)
	if err != nil {
		return nil, err
	}

	totalMemoryGauge, err := meter.Int64ObservableGauge(
		"relayer.TotalMemoryBytes",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			v, err := mem.VirtualMemory()
			if err != nil {
				return err
			}

			result.Observe(int64(v.Total), opts)
			return nil
		}),
		metric.WithDescription("Total memory in bytes."),
	)
	if err != nil {
		return nil, err
	}

	cpuUsageGauge, err := meter.Float64ObservableGauge(
		"relayer.CpuUsagePercent",
		metric.WithFloat64Callback(func(_ context.Context, result metric.Float64Observer) error {
			// interval 0 compares against the previous call; percpu=false
			// asks for a single aggregated value.
			percents, err := cpu.Percent(0, false)
			if err != nil {
				return err
			}
			// Guard the index: an empty result must not panic the collector.
			if len(percents) == 0 {
				return nil
			}

			result.Observe(percents[0], opts)
			return nil
		}),
		metric.WithDescription("CPU usage percent."),
	)
	if err != nil {
		return nil, err
	}

	diskUsageGauge, err := meter.Int64ObservableGauge(
		"relayer.DiskUsageBytes",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			usage, err := disk.Usage("/")
			if err != nil {
				return err
			}

			result.Observe(int64(usage.Used), opts)
			return nil
		}),
		metric.WithDescription("Disk space used by the relayer in bytes."),
	)
	if err != nil {
		return nil, err
	}

	totalDiskGauge, err := meter.Int64ObservableGauge(
		"relayer.TotalDiskBytes",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			usage, err := disk.Usage("/")
			if err != nil {
				return err
			}

			result.Observe(int64(usage.Total), opts)
			return nil
		}),
		metric.WithDescription("Total relayer disk space."),
	)
	if err != nil {
		return nil, err
	}

	networkIOReceivedGauge, err := meter.Int64ObservableGauge(
		"relayer.NetworkIOBytesReceived",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			// pernic=false asks for counters aggregated over all interfaces.
			stat, err := net.IOCounters(false)
			if err != nil {
				return err
			}
			// Guard the index: an empty result must not panic the collector.
			if len(stat) == 0 {
				return nil
			}

			result.Observe(int64(stat[0].BytesRecv), opts)
			return nil
		}),
		metric.WithDescription("Total network bytes received."),
	)
	if err != nil {
		return nil, err
	}

	networkIOSentGauge, err := meter.Int64ObservableGauge(
		"relayer.NetworkIOBytesSent",
		metric.WithInt64Callback(func(_ context.Context, result metric.Int64Observer) error {
			// pernic=false asks for counters aggregated over all interfaces.
			stat, err := net.IOCounters(false)
			if err != nil {
				return err
			}
			// Guard the index: an empty result must not panic the collector.
			if len(stat) == 0 {
				return nil
			}

			result.Observe(int64(stat[0].BytesSent), opts)
			return nil
		}),
		metric.WithDescription("Total network bytes sent."),
	)
	if err != nil {
		return nil, err
	}

	gcDurationHistogram, err := meter.Float64Histogram(
		"relayer.GcDurationSeconds",
		metric.WithDescription("Duration of garbage collection cycles."),
	)
	if err != nil {
		return nil, err
	}

	m := &SystemMetrics{
		opts:                   opts,
		goRoutinesGauge:        goRoutinesGauge,
		totalMemoryGauge:       totalMemoryGauge,
		usedMemoryGauge:        usedMemoryGauge,
		gcDurationHistogram:    gcDurationHistogram,
		cpuUsageGauge:          cpuUsageGauge,
		totalDiskGauge:         totalDiskGauge,
		diskUsageGauge:         diskUsageGauge,
		networkIOReceivedGauge: networkIOReceivedGauge,
		networkIOSentGauge:     networkIOSentGauge,
	}

	// The GC histogram is synchronous, so it is fed from a polling goroutine
	// whose lifetime is bound to ctx.
	go m.updateGCStats(ctx)
	return m, nil
}

// updateGCStats polls the runtime's GC statistics every
// GC_STATS_UPDATE_PERIOD and records, in seconds, the pause duration of each
// GC cycle completed since the previous poll into gcDurationHistogram.
//
// New cycles are detected via GCStats.NumGC rather than by comparing pause
// durations, so consecutive cycles with identical pauses and multiple cycles
// within one period are all recorded. The first poll records every pause the
// runtime still has in its history. The loop runs until ctx is cancelled.
func (m *SystemMetrics) updateGCStats(ctx context.Context) {
	ticker := time.NewTicker(GC_STATS_UPDATE_PERIOD)
	// Release the ticker once the loop exits.
	defer ticker.Stop()

	// lastNumGC is the number of GC cycles already accounted for.
	var lastNumGC int64
	for {
		select {
		case <-ticker.C:
			var gcStats debug.GCStats
			debug.ReadGCStats(&gcStats)

			newCycles := gcStats.NumGC - lastNumGC
			lastNumGC = gcStats.NumGC
			if newCycles <= 0 {
				continue
			}
			// The pause history is bounded; never index past what is available.
			if available := int64(len(gcStats.Pause)); newCycles > available {
				newCycles = available
			}

			// Pause is ordered most recent first; walk backwards so samples
			// are recorded oldest to newest.
			for i := int(newCycles) - 1; i >= 0; i-- {
				m.gcDurationHistogram.Record(context.Background(), gcStats.Pause[i].Seconds(), m.opts)
			}
		case <-ctx.Done():
			return
		}
	}
}
Loading