Skip to content

Commit 0701b5b

Browse files
committed
feat(metrics): add histogram metric type for packet size distribution
Replace 20 individual counter metrics for packet size buckets with two native Prometheus histograms (RX/TX). This maps SAI port stat fields to cumulative histogram buckets via a new `histogram` transform in the metrics config.
1 parent 0d81c06 commit 0701b5b

12 files changed

Lines changed: 500 additions & 203 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*.so
66
*.dylib
77
bin/*
8+
/agent
89
Dockerfile.cross
910
dev
1011

docs/usage/metrics.md

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ These are defined in YAML and can be customized or extended by operators. The de
6262
| `sonic_switch_interface_fec_frames_total` | counter | `interface`, `type` | FEC frame counters (correctable, uncorrectable, symbol_errors) |
6363
| `sonic_switch_interface_queue_length` | gauge | `interface` | Current output queue length |
6464
| `sonic_switch_interface_pfc_packets_total` | counter | `interface`, `direction`, `priority` | PFC packets per priority (0-7) |
65-
| `sonic_switch_interface_packet_size_total` | counter | `interface`, `direction`, `size` | Packets by size bucket |
65+
| `sonic_switch_interface_rx_packet_size_bytes` | histogram | `interface` | RX packet size distribution (buckets: 64, 127, 255, 511, 1023, 1518, 2047, 4095, 9216, 16383) |
66+
| `sonic_switch_interface_tx_packet_size_bytes` | histogram | `interface` | TX packet size distribution (buckets: 64, 127, 255, 511, 1023, 1518, 2047, 4095, 9216, 16383) |
6667
| `sonic_switch_interface_anomaly_packets_total` | counter | `interface`, `type` | Anomalous packets (undersize, oversize, fragments, jabbers, unknown_protos) |
6768

6869
## Metrics configuration schema
@@ -99,7 +100,7 @@ Each entry maps a Redis hash field (or set of fields) to a Prometheus metric.
99100
| `field` | no | — | Specific Redis hash field name. Mutually exclusive with `field_pattern` |
100101
| `field_pattern` | no | — | Set to `*` to iterate all hash fields. Mutually exclusive with `field` |
101102
| `metric` | yes | — | Prometheus metric name |
102-
| `type` | yes | — | `gauge` or `counter` |
103+
| `type` | yes | — | `gauge`, `counter`, or `histogram` |
103104
| `help` | no | — | Metric help string |
104105
| `value` | no | — | Fixed metric value (ignores field value). Use for `_info` pattern metrics |
105106
| `labels` | no | — | Map of label names to [value templates](#label-value-templates) |
@@ -191,6 +192,29 @@ transform:
191192
dom_flag_severity: true
192193
```
193194

195+
#### `histogram`
196+
197+
Maps multiple Redis hash fields to a single Prometheus histogram. Each entry in `buckets` maps an upper bound (float64) to a Redis hash field name. The transform reads each field, parses the count as an unsigned integer, and accumulates cumulative bucket counts. The resulting histogram has `sum=0` because SAI counters don't provide total bytes — but bucket-based percentile queries and heatmap visualizations still work. Requires `type: "histogram"`.
198+
199+
```yaml
200+
- metric: sonic_switch_interface_rx_packet_size_bytes
201+
type: histogram
202+
help: "RX packet size distribution"
203+
labels:
204+
interface: "$port_name"
205+
transform:
206+
histogram:
207+
buckets:
208+
64: SAI_PORT_STAT_ETHER_IN_PKTS_64_OCTETS
209+
127: SAI_PORT_STAT_ETHER_IN_PKTS_65_TO_127_OCTETS
210+
255: SAI_PORT_STAT_ETHER_IN_PKTS_128_TO_255_OCTETS
211+
511: SAI_PORT_STAT_ETHER_IN_PKTS_256_TO_511_OCTETS
212+
1023: SAI_PORT_STAT_ETHER_IN_PKTS_512_TO_1023_OCTETS
213+
1518: SAI_PORT_STAT_ETHER_IN_PKTS_1024_TO_1518_OCTETS
214+
```
215+
216+
The client library automatically expands the histogram into `_bucket`, `_count`, and `_sum` series — no per-suffix configuration is needed (with this transform, `_sum` is fixed at 0 as described above).
217+
194218
## Examples
195219

196220
### Adding a new counter from COUNTERS_DB

internal/agent/agent_server/server.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,16 @@ package agent_server
55

66
import (
77
"context"
8+
"errors"
89
"flag"
910
"fmt"
1011
"log"
1112
"net"
1213
"net/http"
14+
"os"
15+
"os/signal"
16+
"syscall"
17+
"time"
1318

1419
switchAgent "github.com/ironcore-dev/sonic-operator/internal/agent/interface"
1520
"github.com/ironcore-dev/sonic-operator/internal/agent/metrics"
@@ -308,7 +313,7 @@ func StartServer() {
308313
metricsSrv := metrics.NewMetricsServer(fmt.Sprintf("0.0.0.0:%d", *metricsPort), swAgent, sonic.GetSonicVersionInfo, *metricsConfig)
309314
go func() {
310315
log.Printf("metrics server listening at 0.0.0.0:%d", *metricsPort)
311-
if err := metricsSrv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
316+
if err := metricsSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
312317
log.Fatalf("metrics server failed: %v", err)
313318
}
314319
}()
@@ -318,6 +323,24 @@ func StartServer() {
318323
// Register reflection service on gRPC server for debugging
319324
reflection.Register(s)
320325

326+
// Handle OS signals for graceful shutdown
327+
sigCh := make(chan os.Signal, 1)
328+
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
329+
go func() {
330+
sig := <-sigCh
331+
log.Printf("received signal %v, shutting down", sig)
332+
333+
// Gracefully stop the gRPC server (drains in-flight RPCs)
334+
s.GracefulStop()
335+
336+
// Shut down the metrics HTTP server with a timeout
337+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
338+
defer cancel()
339+
if err := metricsSrv.Shutdown(ctx); err != nil {
340+
log.Printf("metrics server shutdown error: %v", err)
341+
}
342+
}()
343+
321344
log.Printf("gRPC server listening at %v", lis.Addr())
322345
if err := s.Serve(lis); err != nil {
323346
log.Fatalf("failed to serve: %v", err)

internal/agent/metrics/collector.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
1+
// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors
22
// SPDX-License-Identifier: Apache-2.0
33

44
package metrics

internal/agent/metrics/config.go

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1-
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
1+
// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors
22
// SPDX-License-Identifier: Apache-2.0
33

44
package metrics
55

66
import (
77
"embed"
8+
"encoding/json"
89
"fmt"
910
"os"
1011
"regexp"
12+
"strconv"
1113

1214
"sigs.k8s.io/yaml"
1315
)
@@ -76,6 +78,8 @@ type Transform struct {
7678
RegexCapture *RegexCapture `json:"regex_capture,omitempty"`
7779
// DOMFlagSeverity computes a severity rollup (0=ok, 1=warning, 2=alarm) from all hash fields.
7880
DOMFlagSeverity bool `json:"dom_flag_severity,omitempty"`
81+
// Histogram maps upper bounds to Redis field names, emitting a Prometheus histogram.
82+
Histogram *HistogramBuckets `json:"histogram,omitempty"`
7983
}
8084

8185
// RegexCapture defines a regex-based field name matching transform.
@@ -86,6 +90,35 @@ type RegexCapture struct {
8690
Pattern string `json:"pattern"`
8791
}
8892

93+
// HistogramBuckets defines a histogram transform that maps Redis field names to
94+
// Prometheus histogram bucket upper bounds.
95+
type HistogramBuckets struct {
96+
// Buckets maps upper bounds (in bytes, seconds, etc.) to Redis hash field names.
97+
// Values are read, parsed as uint64, and accumulated into cumulative histogram buckets.
98+
Buckets map[float64]string `json:"buckets"`
99+
}
100+
101+
// UnmarshalJSON implements custom JSON unmarshaling for HistogramBuckets.
102+
// sigs.k8s.io/yaml converts YAML→JSON, so numeric YAML keys become JSON string keys.
103+
// This method parses those string keys back to float64.
104+
func (hb *HistogramBuckets) UnmarshalJSON(data []byte) error {
105+
var raw struct {
106+
Buckets map[string]string `json:"buckets"`
107+
}
108+
if err := json.Unmarshal(data, &raw); err != nil {
109+
return err
110+
}
111+
hb.Buckets = make(map[float64]string, len(raw.Buckets))
112+
for k, v := range raw.Buckets {
113+
f, err := strconv.ParseFloat(k, 64)
114+
if err != nil {
115+
return fmt.Errorf("histogram bucket key %q is not a valid number: %w", k, err)
116+
}
117+
hb.Buckets[f] = v
118+
}
119+
return nil
120+
}
121+
89122
// effectiveSeparator returns the key separator, defaulting to "|".
90123
func (m *MetricMapping) effectiveSeparator() string {
91124
if m.KeySeparator != "" {
@@ -135,8 +168,13 @@ func validateConfig(cfg *MetricsConfig) error {
135168
if f.Metric == "" {
136169
return fmt.Errorf("metrics[%d].fields[%d]: metric is required", i, j)
137170
}
138-
if f.Type != "gauge" && f.Type != "counter" {
139-
return fmt.Errorf("metrics[%d].fields[%d]: type must be 'gauge' or 'counter', got %q", i, j, f.Type)
171+
if f.Type != metricTypeGauge && f.Type != metricTypeCounter && f.Type != metricTypeHistogram {
172+
return fmt.Errorf("metrics[%d].fields[%d]: type must be 'gauge', 'counter', or 'histogram', got %q", i, j, f.Type)
173+
}
174+
if f.Type == metricTypeHistogram {
175+
if f.Transform == nil || f.Transform.Histogram == nil || len(f.Transform.Histogram.Buckets) == 0 {
176+
return fmt.Errorf("metrics[%d].fields[%d]: histogram type requires transform.histogram.buckets", i, j)
177+
}
140178
}
141179
if f.Field != "" && f.FieldPattern != "" {
142180
return fmt.Errorf("metrics[%d].fields[%d]: field and field_pattern are mutually exclusive", i, j)

internal/agent/metrics/config_collector.go

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
1+
// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors
22
// SPDX-License-Identifier: Apache-2.0
33

44
package metrics
@@ -35,7 +35,7 @@ func NewConfigCollector(connector RedisConnector, mapping MetricMapping) *Config
3535
if _, exists := descs[f.Metric]; exists {
3636
continue
3737
}
38-
labels := sortedLabelNames(f, mapping.KeyResolver != "")
38+
labels := sortedLabelNames(f)
3939
// If this field uses parse_threshold_field, additional labels are added dynamically
4040
if f.Transform != nil && f.Transform.ParseThresholdField {
4141
labels = appendUnique(labels, "sensor", "level", "direction")
@@ -96,7 +96,11 @@ func (c *ConfigCollector) Collect(ch chan<- prometheus.Metric) {
9696
}
9797
}
9898

99-
// Fetch all matching keys
99+
// Fetch all matching keys.
100+
// NOTE: Redis KEYS is O(N) and blocks the single-threaded event loop. This is
101+
// acceptable for SONiC's embedded Redis where key counts are bounded by the number
102+
// of physical ports/transceivers (typically <256). For larger key spaces, consider
103+
// replacing with SCAN.
100104
keys, err := client.Keys(ctx, c.mapping.KeyPattern).Result()
101105
if err != nil {
102106
log.Printf("ConfigCollector[%s]: failed to list keys: %v",
@@ -136,6 +140,14 @@ func (c *ConfigCollector) Collect(ch chan<- prometheus.Metric) {
136140
continue
137141
}
138142

143+
// histogram operates on the whole hash — reads specific bucket fields
144+
if fm.Type == "histogram" && fm.Transform != nil && fm.Transform.Histogram != nil {
145+
desc := c.descs[fm.Metric]
146+
labels := resolveLabels(fm.Labels, keySuffix, portName, "", fields)
147+
collectHistogram(ch, desc, fm.Transform.Histogram, fields, labels)
148+
continue
149+
}
150+
139151
if fm.FieldPattern == "*" {
140152
// Iterate all fields
141153
c.collectAllFields(ch, fi, fm, fields, keySuffix, portName)
@@ -288,7 +300,7 @@ func resolveTemplate(tmpl, keySuffix, portName, fieldName string, hashFields map
288300
}
289301

290302
// sortedLabelNames extracts label names from a FieldMapping in sorted order.
291-
func sortedLabelNames(f FieldMapping, _ bool) []string {
303+
func sortedLabelNames(f FieldMapping) []string {
292304
names := make([]string, 0, len(f.Labels))
293305
for name := range f.Labels {
294306
names = append(names, name)
@@ -311,3 +323,47 @@ func appendUnique(slice []string, items ...string) []string {
311323
}
312324
return slice
313325
}
326+
327+
// collectHistogram reads bucket fields from the hash, accumulates cumulative counts,
328+
// and emits a prometheus.MustNewConstHistogram.
329+
func collectHistogram(
330+
ch chan<- prometheus.Metric,
331+
desc *prometheus.Desc,
332+
hb *HistogramBuckets,
333+
hashFields map[string]string,
334+
labels []string,
335+
) {
336+
// Sort upper bounds
337+
bounds := make([]float64, 0, len(hb.Buckets))
338+
for ub := range hb.Buckets {
339+
bounds = append(bounds, ub)
340+
}
341+
sort.Float64s(bounds)
342+
343+
// Read non-cumulative counts from Redis and accumulate into cumulative buckets.
344+
var totalCount uint64
345+
cumBuckets := make(map[float64]uint64, len(bounds))
346+
var cumulative uint64
347+
for _, ub := range bounds {
348+
fieldName := hb.Buckets[ub]
349+
val, ok := hashFields[fieldName]
350+
if !ok {
351+
cumBuckets[ub] = cumulative
352+
continue
353+
}
354+
n, err := strconv.ParseUint(val, 10, 64)
355+
if err != nil {
356+
cumBuckets[ub] = cumulative
357+
continue
358+
}
359+
cumulative += n
360+
cumBuckets[ub] = cumulative
361+
}
362+
totalCount = cumulative
363+
364+
// +Inf bucket count equals totalCount (Prometheus adds it automatically).
365+
// sum is 0 — SAI doesn't provide total bytes, only bucket counts.
366+
ch <- prometheus.MustNewConstHistogram(
367+
desc, totalCount, 0, cumBuckets, labels...,
368+
)
369+
}

0 commit comments

Comments
 (0)