Skip to content

Commit 1990fd5

Browse files
committed
feat: boot sequence health check and Prometheus health metrics
Add a boot sequence health check that causes the app to abort if any of the following requirements isn't met: filesystem write permissions, Redis connectivity, and LiveKit connectivity (when enabled; see `livekit.healthCheck`). Additionally, add a health metric to the Prometheus exporter called `recorder_component_health`. It is a gauge with a single `component` label that takes one of two values, `livekit` or `redis`; a value of 1 means healthy and 0 means unhealthy.
1 parent 7f7b3e6 commit 1990fd5

File tree

9 files changed

+165
-7
lines changed

9 files changed

+165
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
### UNRELEASED
6+
7+
* feat: boot sequence health check and Prometheus health metrics
8+
59
### v0.9.4
610

711
* feat(livekit): add Prom counters for reconnects and sub failures

config/bbb-webrtc-recorder.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,6 @@ livekit:
5151
host: ws://localhost:7880
5252
apiKey: ""
5353
apiSecret: ""
54+
healthCheck:
55+
enable: false
56+
interval: 1m

internal/app/app.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@ import (
55
"os"
66
"os/signal"
77
"syscall"
8+
"time"
89

910
"github.com/bigbluebutton/bbb-webrtc-recorder/internal"
1011
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/appstats"
1112
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/config"
1213
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/pubsub"
1314
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/server"
15+
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/webrtc/livekit"
16+
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/webrtc/recorder"
1417
"github.com/google/uuid"
1518
log "github.com/sirupsen/logrus"
1619
flag "github.com/spf13/pflag"
@@ -82,6 +85,43 @@ func Run() {
8285

8386
ps = pubsub.NewPubSub(cfg.PubSub)
8487

88+
if err := ps.Check(); err != nil {
89+
log.Fatalf("failed to connect to pubsub: %v", err)
90+
}
91+
92+
if err := recorder.CheckFsPermissions(cfg.Recorder); err != nil {
93+
log.Fatalf("failed to check recorder filesystem permissions: %v", err)
94+
}
95+
96+
if cfg.LiveKit.HealthCheck.Enable {
97+
log.WithField("interval", cfg.LiveKit.HealthCheck.Interval).
98+
WithField("host", cfg.LiveKit.Host).
99+
Debug("LiveKit health check enabled")
100+
101+
if err := livekit.CheckConnectivity(cfg.LiveKit); err != nil {
102+
log.Fatalf("failed to connect to LiveKit: %v", err)
103+
}
104+
105+
appstats.SetComponentHealth("livekit", true)
106+
log.WithField("host", cfg.LiveKit.Host).
107+
Info("LiveKit is ready")
108+
109+
go func() {
110+
ticker := time.NewTicker(cfg.LiveKit.HealthCheck.Interval)
111+
defer ticker.Stop()
112+
for {
113+
<-ticker.C
114+
if err := livekit.CheckConnectivity(cfg.LiveKit); err != nil {
115+
log.Warnf("livekit health check failed: %v", err)
116+
appstats.SetComponentHealth("livekit", false)
117+
} else {
118+
log.Trace("livekit health check succeeded")
119+
appstats.SetComponentHealth("livekit", true)
120+
}
121+
}
122+
}()
123+
}
124+
85125
if cfg.HTTP.Enable {
86126
hs := server.NewHTTPServer(cfg, ps)
87127
hs.Serve()

internal/appstats/prometheus.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,15 @@ var (
190190
[]string{
191191
"error", // error string
192192
})
193+
194+
ComponentHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
195+
Subsystem: "recorder",
196+
Name: "component_health",
197+
Help: "Health status of recorder components (1 = healthy, 0 = unhealthy)",
198+
},
199+
[]string{
200+
"component",
201+
})
193202
)
194203

195204
func Init() {
@@ -211,6 +220,7 @@ func Init() {
211220
prometheus.MustRegister(ParticipantReconnects)
212221
prometheus.MustRegister(TrackSubscriptionFailures)
213222
prometheus.MustRegister(ParticipantReconnectingEvents)
223+
prometheus.MustRegister(ComponentHealth)
214224
}
215225

216226
func newMetricsHandler() *metricsHandler {
@@ -322,6 +332,16 @@ func OnServerResponse(msg interface{}) {
322332
}
323333
}
324334

335+
func SetComponentHealth(component string, healthy bool) {
336+
status := 0.0
337+
338+
if healthy {
339+
status = 1.0
340+
}
341+
342+
ComponentHealth.WithLabelValues(component).Set(status)
343+
}
344+
325345
func UpdateCaptureMetrics(stats *CaptureStats) {
326346
if stats == nil {
327347
return

internal/config/config.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ func (cfg *Config) SetDefaults() {
8080
APISecret: "",
8181
PacketReadTimeout: 500 * time.Millisecond,
8282
PreferredVideoQuality: livekit.VideoQuality_HIGH,
83+
HealthCheck: HealthCheck{
84+
Enable: false,
85+
Interval: 1 * time.Minute,
86+
},
8387
}
8488
}
8589

@@ -131,11 +135,17 @@ type Prometheus struct {
131135
}
132136

133137
type LiveKit struct {
134-
Host string `mapstructure:"host"`
135-
APIKey string `mapstructure:"api_key"`
136-
APISecret string `mapstructure:"api_secret"`
137-
PacketReadTimeout time.Duration `mapstructure:"packet_read_timeout"`
138-
PreferredVideoQuality livekit.VideoQuality `mapstructure:"preferred_video_quality"`
138+
Host string `yaml:"host,omitempty" mapstructure:"host"`
139+
APIKey string `yaml:"apiKey,omitempty" mapstructure:"api_key"`
140+
APISecret string `yaml:"apiSecret,omitempty" mapstructure:"api_secret"`
141+
PacketReadTimeout time.Duration `yaml:"packetReadTimeout,omitempty" mapstructure:"packet_read_timeout"`
142+
PreferredVideoQuality livekit.VideoQuality `yaml:"preferredVideoQuality,omitempty" mapstructure:"preferred_video_quality"`
143+
HealthCheck HealthCheck `yaml:"healthCheck,omitempty"`
144+
}
145+
146+
// HealthCheck holds the settings for the LiveKit connectivity health check.
type HealthCheck struct {
147+
// Enable gates the LiveKit connectivity check performed at boot and the
// periodic re-check that follows it. SetDefaults initializes it to false.
Enable bool `yaml:"enable,omitempty"`
148+
// Interval is the period of the ticker driving the periodic re-check.
// SetDefaults initializes it to one minute.
Interval time.Duration `yaml:"interval,omitempty"`
139149
}
140150

141151
type LogConfig struct {

internal/pubsub/pubsub.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ type PubSub interface {
1313
Subscribe(channel string, handler PubSubHandler, onStart func() error) error
1414
Publish(channel string, message []byte) error
1515
Close() error
16+
Check() error
1617
}
1718

1819
type PubSubHandler func(ctx context.Context, message []byte)

internal/pubsub/redis.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"time"
66

7+
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/appstats"
78
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/config"
89
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/pubsub/redis"
910
"github.com/cenkalti/backoff/v4"
@@ -37,6 +38,7 @@ func (r *Redis) Subscribe(channel string, handler PubSubHandler, onStart func()
3738
log.Infof("Subscribed to pubsub %s on redis %s", channel, r.config.Address)
3839
r.hasConnected = true
3940
eb.Reset()
41+
appstats.SetComponentHealth("redis", true)
4042

4143
return onStart()
4244
},
@@ -59,6 +61,7 @@ func (r *Redis) Subscribe(channel string, handler PubSubHandler, onStart func()
5961
log.Errorf("failed to subscribe to pubsub %s on %s: %s", channel, r.config.Address, err)
6062
return err
6163
} else {
64+
appstats.SetComponentHealth("redis", false)
6265
log.Errorf("failed to subscribe to pubsub %s on %s: %s - retrying in %s", channel, r.config.Address, err, next)
6366
}
6467

@@ -79,6 +82,10 @@ func (r *Redis) Publish(channel string, message []byte) error {
7982
return r.pubsub.Publish(channel, message)
8083
}
8184

85+
func (r *Redis) Check() error {
86+
return r.pubsub.Check()
87+
}
88+
8289
func NewRedis(cfg config.Redis) *Redis {
8390
r := &Redis{config: cfg}
8491
if p, err := redis.NewPubSub(cfg.Network, cfg.Address, cfg.Password); err != nil {

internal/pubsub/redis/redis.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,22 @@ loop:
119119
return <-done
120120
}
121121

122+
func (p *PubSub) Check() error {
123+
c, err := p.dial()
124+
125+
if err != nil {
126+
return err
127+
}
128+
129+
defer c.Close()
130+
131+
if _, err = c.Do("PING"); err != nil {
132+
return err
133+
}
134+
135+
return nil
136+
}
137+
122138
func (p *PubSub) Publish(channel string, message []byte) error {
123139
c, err := p.dial()
124140
if err != nil {

internal/webrtc/recorder/recorder.go

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/config"
1313
"github.com/bigbluebutton/bbb-webrtc-recorder/internal/webrtc/interfaces"
1414
"github.com/pion/rtp"
15+
log "github.com/sirupsen/logrus"
1516
)
1617

1718
// KeyframeRequester defines the interface for requesting keyframes
@@ -34,11 +35,41 @@ type Recorder interface {
3435
Close() time.Duration
3536
}
3637

38+
func CheckFsPermissions(cfg config.Recorder) error {
39+
dir := path.Clean(cfg.Directory)
40+
41+
if err := checkDirectory(dir); err != nil {
42+
return err
43+
}
44+
45+
fileMode, _ := parseFileMode(cfg.FileMode)
46+
47+
tmpFile, err := os.CreateTemp(dir, ".rec-file-perm-check-*")
48+
49+
if err != nil {
50+
return fmt.Errorf("recorder directory is not writable: %w", err)
51+
}
52+
53+
defer func() {
54+
_ = tmpFile.Close()
55+
if err := os.Remove(tmpFile.Name()); err != nil {
56+
log.WithField("file", tmpFile.Name()).Warnf("could not remove permission check file: %v", err)
57+
}
58+
}()
59+
60+
// Check if the configured file mode can be applied
61+
if err := tmpFile.Chmod(fileMode); err != nil {
62+
return fmt.Errorf("cannot apply file mode %s: %w", cfg.FileMode, err)
63+
}
64+
65+
return nil
66+
}
67+
3768
func ValidateAndPrepareFile(ctx context.Context, cfg config.Recorder, file string) (string, os.FileMode, error) {
3869
dir := path.Clean(cfg.Directory)
3970

40-
if _, err := os.Stat(dir); os.IsNotExist(err) {
41-
return "", 0, fmt.Errorf("directory does not exist %s", cfg.Directory)
71+
if err := checkDirectory(dir); err != nil {
72+
return "", 0, err
4273
}
4374

4475
if !cfg.WriteToDevNull {
@@ -108,3 +139,29 @@ func NewRecorder(ctx context.Context, cfg config.Recorder, file string) (Recorde
108139
}
109140
return r, nil
110141
}
142+
143+
// parseFileMode converts a textual file mode such as "0644" into an
// os.FileMode. The string is parsed by strconv.ParseUint with base 0, so the
// base is taken from the prefix: a leading "0" or "0o" selects octal, "0x"
// hexadecimal, "0b" binary, and a string without a prefix is read as decimal.
//
// On failure it returns a zero mode and an error that wraps the underlying
// strconv error, so callers can inspect the cause with errors.Is / errors.As.
func parseFileMode(mode string) (os.FileMode, error) {
	parsed, err := strconv.ParseUint(mode, 0, 32)
	if err != nil {
		return 0, fmt.Errorf("invalid file mode %s: %w", mode, err)
	}

	return os.FileMode(parsed), nil
}
150+
151+
// checkDirectory verifies that dir exists and is a directory.
//
// It returns an error when dir does not exist, when it cannot be stat'ed for
// any other reason (the stat error is wrapped), or when the path exists but is
// not a directory. It returns nil otherwise.
func checkDirectory(dir string) error {
	info, err := os.Stat(dir)
	if err != nil {
		if os.IsNotExist(err) {
			return fmt.Errorf("recorder directory does not exist: %s", dir)
		}

		// Any other stat failure (e.g. permission denied on a parent) is
		// reported with the underlying cause attached.
		return fmt.Errorf("could not stat recorder directory %s: %w", dir, err)
	}

	if !info.IsDir() {
		return fmt.Errorf("recorder path is not a directory: %s", dir)
	}

	return nil
}

0 commit comments

Comments
 (0)